diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000000000000000000000000000000000000..d2fcc2b1c4384d0bcd1424b7f83db8e48fa753f6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,23 @@
+BSD 2-CLAUSE LICENSE
+Copyright 2024 LinkedIn Corporation
+All Rights Reserved.
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the following
+conditions are met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials provided
+with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..904d3c1492b7a20bb2a0e993404ed644c770b213
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,73 @@
+.PHONY: test checkstyle test-convergence all serve build clean
+
+
+all: checkstyle test test-convergence
+
+# Command to run pytest for correctness tests
+test:
+	python -m pytest --disable-warnings \
+	--cov=src/liger_kernel \
+	--cov-report=term-missing \
+	--ignore=test/convergence \
+	test/
+
+# Command to run coverage report
+coverage:
+	coverage report -m
+
+# Command to run ruff for linting and formatting code
+checkstyle:
+	ruff check --output-format=concise .; ruff_check_status=$$?; \
+	ruff format --check --diff .; ruff_format_status=$$?; \
+	ruff check . --fix; \
+	ruff format .; \
+	if [ $$ruff_check_status -ne 0 ] || [ $$ruff_format_status -ne 0 ]; then \
+		exit 1; \
+	fi
+
+# Command to run pytest for convergence tests
+# We have to explicitly set HF_DATASETS_OFFLINE=1, or dataset will silently try to send metrics and timeout (80s) https://github.com/huggingface/datasets/blob/37a603679f451826cfafd8aae00738b01dcb9d58/src/datasets/load.py#L286
+test-convergence:
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/fp32/test_mini_models.py
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/fp32/test_mini_models_multimodal.py
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/fp32/test_mini_models_with_logits.py
+
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/bf16/test_mini_models.py
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/bf16/test_mini_models_multimodal.py
+	HF_DATASETS_OFFLINE=1 python -m pytest --disable-warnings test/convergence/bf16/test_mini_models_with_logits.py
+
+# Command to run all benchmark scripts and update benchmarking data file
+# By default this doesn't overwrite existing data for the same benchmark experiment
+# run with `make run-benchmarks OVERWRITE=1` to overwrite existing benchmark data
+BENCHMARK_DIR = benchmark/scripts
+BENCHMARK_SCRIPTS = $(wildcard $(BENCHMARK_DIR)/benchmark_*.py)
+OVERWRITE ?= 0
+
+run-benchmarks:
+	@for script in $(BENCHMARK_SCRIPTS); do \
+		echo "Running benchmark: $$script"; \
+		if [ $(OVERWRITE) -eq 1 ]; then \
+			python $$script --overwrite; \
+		else \
+			python $$script; \
+		fi; \
+	done
+
+# MkDocs Configuration
+MKDOCS = mkdocs
+CONFIG_FILE = mkdocs.yml
+SITE_DIR = site
+
+# MkDocs targets
+
+# Serve the documentation
+serve:
+	$(MKDOCS) serve -f $(CONFIG_FILE)
+
+# Build the documentation into the specified site directory
+build:
+	$(MKDOCS) build -f $(CONFIG_FILE) --site-dir $(SITE_DIR)
+
+# Clean the output directory
+clean:
+	rm -rf $(SITE_DIR)/
diff --git a/NOTICE b/NOTICE
new file mode 100755
index 0000000000000000000000000000000000000000..ea2881754f5b3e0eb9926dd9dc6c9d772f962911
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,58 @@
+Copyright 2024 LinkedIn Corporation
+All Rights Reserved.
+
+Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
+
+This product includes software developed by LinkedIn Corporation.
+
+This product contains code derived from the following open source projects:
+
+1. Unsloth
+   Copyright (c) 2023 Unsloth AI
+   Licensed under the Apache License, Version 2.0
+   Source: https://github.com/unslothai/unsloth
+
+   The `calculate_settings` function to determine block size and warp is reused for Norm and MLP operations.
+   Modifications and additions were made to the RMS Norm implementation.
+
+2. Triton
+   Copyright (c) 2023 OpenAI
+   Licensed under the MIT License
+   Source: https://github.com/openai/triton
+
+   Modifications were made based on Triton tutorials for the RMS Norm implementation.
+
+3. Efficient Cross Entropy
+   Copyright (c) 2023 Mohamed Malek
+   Licensed under the MIT License
+   Source: https://github.com/mgmalek/efficient_cross_entropy
+
+   The idea of gradient-in-forward and chunking was used in the Linear Cross Entropy implementation.
+
+4. Flash Attention
+   Copyright (c) 2023 Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
+   Licensed under the BSD 3-Clause License
+   Source: https://github.com/Dao-AILab/flash-attention
+
+   Optimization ideas such as tiling and recomputation were inspired by this work.
+
+5. AutoAWQ
+   Copyright (c) 2023 Casper Hansen
+   Licensed under the MIT License
+   Source: https://github.com/casper-hansen/AutoAWQ
+
+   The design of the automodel was referenced from this project.
+
+6. llm.c
+   Copyright (c) 2023 Andrej Karpathy
+   Licensed under the MIT License
+   Source: https://github.com/karpathy/llm.c
+
+   The design of end-to-end testing was referenced from this project.
+
+7. Tiny Shakespeare Dataset
+   Source: https://huggingface.co/datasets/karpathy/tiny_shakespeare
+
+   This dataset is used to conduct convergence tests on mini models.
+
+For full license texts, please refer to the respective project repositories.
diff --git a/benchmark/BENCHMARK_GUIDELINES.md b/benchmark/BENCHMARK_GUIDELINES.md
new file mode 100755
index 0000000000000000000000000000000000000000..907223430151540d36acf7ac73509a1252f2ca65
--- /dev/null
+++ b/benchmark/BENCHMARK_GUIDELINES.md
@@ -0,0 +1,101 @@
+# Guideline for Adding Benchmark Scripts
+
+This document describes how to add new benchmark scripts to Liger-Kernel in line with the shared framework.
+
+## 1. Where and how to add a script
+
+- **Location**: `benchmark/scripts/`
+- **Naming**: `benchmark_<kernel_name>.py` (e.g. `benchmark_geglu.py`, `benchmark_swiglu.py`)
+
+## 2. Use shared infrastructure
+
+Do **not** hardcode batch size, sequence length, or model dimensions. Use:
+
+| Need | Use |
+|------|-----|
+| Model dimensions (hidden_size, vocab_size, etc.) | `benchmark_model_configs.py`: `ModelConfig`, `get_benchmark_model_config()` |
+| Safe sweep config (seq_len or hidden_size) | `compute_seq_len_sweep_config()` (returns `SeqLenSweepConfig`) or `compute_hidden_size_sweep_config()` (returns `HiddenSizeSweepConfig`), with optional `estimate_kernel_peak_memory()` |
+| Speed / memory measurement | `utils.py`: `run_speed_benchmark()`, `run_memory_benchmark()` |
+| CLI (overwrite, model choice) | `utils.py`: `parse_benchmark_script_args()` (includes `--model`) |
+| Running the grid and writing CSV | `utils.py`: `run_benchmarks()` |
+
+## 3. Script structure (three parts)
+
+### 3.1 Setup factory
+
+Define a single **setup function** that builds inputs and the layer (or callable) from `SingleBenchmarkRunInput`, so both speed and memory benchmarks reuse the same setup.
+
+- **Signature**: `_setup_<kernel>(input: SingleBenchmarkRunInput) -> (tensors, layer_or_fn)`
+- **Input**: `input.x` is the varying dimension (e.g. sequence length); `input.extra_benchmark_config` holds `bsz`, `hidden_size`, `dtype`, etc.; `input.kernel_provider` identifies the implementation variant (e.g. `"liger"`, `"huggingface"`, `"torch"`; values are kernel-specific).
+- **Return**: Whatever the benchmark helpers need (e.g. `(x, layer)` for a single-tensor forward like GEGLU).
+
+Example (conceptually):
+
+```python
+def _setup_geglu(input: SingleBenchmarkRunInput):
+    cfg = input.extra_benchmark_config
+    # Build config, create x tensor, instantiate LigerGEGLUMLP or LlamaMLP by provider
+    return x, layer
+```
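+
+For orientation, here is a fuller sketch of the same factory. It is illustrative only: the config keys (`bsz`, `hidden_size`, `intermediate_size`), the import for `SingleBenchmarkRunInput`, and the probe-friendly defaults are assumptions in this sketch, not a prescribed implementation.
+
+```python
+import torch
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaMLP
+
+from liger_kernel.transformers.geglu import LigerGEGLUMLP
+from utils import SingleBenchmarkRunInput  # assumed to live in the shared utils.py
+
+
+def _setup_geglu(input: SingleBenchmarkRunInput):
+    cfg = input.extra_benchmark_config
+    seq_len = input.x  # the varying dimension in a seq_len sweep
+
+    # Hypothetical config keys; use whatever your __main__ put into
+    # extra_benchmark_configs.
+    llama_config = LlamaConfig(
+        hidden_size=cfg["hidden_size"],
+        intermediate_size=cfg["intermediate_size"],
+        hidden_act="gelu_pytorch_tanh",  # GEGLU-style activation
+    )
+    x = torch.randn(
+        cfg["bsz"], seq_len, cfg["hidden_size"],
+        dtype=cfg["dtype"], device="cuda", requires_grad=True,
+    )
+    # Pick the implementation variant by provider.
+    if input.kernel_provider == "liger":
+        layer = LigerGEGLUMLP(llama_config).to("cuda").to(cfg["dtype"])
+    else:
+        layer = LlamaMLP(llama_config).to("cuda").to(cfg["dtype"])
+    return x, layer
+```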
+
+### 3.2 Speed and memory benchmark functions
+
+Each takes `SingleBenchmarkRunInput` and returns `SingleBenchmarkRunOutput` by calling the shared helpers.
+
+- **Speed**: `run_speed_benchmark(fwd_fn, mode, input_tensors, rep=...)`
+- **Memory**: `run_memory_benchmark(fwd_fn, mode)`
+- **Modes**: Use `["full", "forward", "backward"]` for both speed and memory for consistency.
+
+Example:
+
+```python
+def bench_speed_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    x, layer = _setup_geglu(input)
+    return run_speed_benchmark(lambda: layer(x), input.kernel_operation_mode, [x])
+
+def bench_memory_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    x, layer = _setup_geglu(input)
+    return run_memory_benchmark(lambda: layer(x), input.kernel_operation_mode)
+```
+
+For **scalar output** (e.g. loss) or **multiple outputs** (e.g. RoPE), use the appropriate helpers from `utils.py` if available (e.g. loss or multi-output variants), or implement custom measurement and still use the same setup factory and `run_benchmarks()`.
+
+### 3.3 `__main__`: model config, shape computation, run
+
+1. Parse args: `args = parse_benchmark_script_args()` and resolve `model = get_benchmark_model_config(args.model)`.
+2. (Recommended) Measure peak memory with a small probe using the **highest-memory baseline** implementation (e.g. `"huggingface"` or `"torch"`):
+   - Define a `_probe()` function that creates tensors/layers, runs a forward pass, and returns the output tensor. `_probe()` owns setup; `estimate_kernel_peak_memory` handles memory-stat reset before the call, runs `.backward()`, and performs cleanup (gc + cache clear) afterward.
+   - Call `peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe)`.
+3. Compute sweep config (device memory is obtained internally by both helpers):
+   - **Sequence-length sweep** (e.g. GEGLU, SwiGLU): convert peak bytes to per-token (`kernel_bpt = peak_bytes // probe_seq_len`), then `config = compute_seq_len_sweep_config(model, kernel_bytes_per_token=kernel_bpt)`. The returned `SeqLenSweepConfig` has `batch_size` and `seq_len`.
+   - **Hidden-size sweep** (e.g. DyT): pass total peak bytes directly: `config = compute_hidden_size_sweep_config(model, kernel_peak_bytes=peak_bytes, bt=BT)`. The returned `HiddenSizeSweepConfig` has `bt` and `max_hidden_size`.
+4. Build `x_values` from `config.seq_len` (seq_len sweep) or `config.max_hidden_size` (hidden_size sweep).
+5. Build `extra_benchmark_configs` from `model` and config:
+   - Seq_len sweep: e.g. `bsz=config.batch_size`, `hidden_size=model.hidden_size`, `dtype=model.dtype`.
+   - Hidden_size sweep: e.g. `BT=config.bt`, `dtype=model.dtype`.
+6. Call `run_benchmarks(..., kernel_operation_modes=["full", "forward", "backward"], ...)` for both speed and memory, as sketched below.
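+
+Putting the steps together, a seq_len-sweep `__main__` might look like the following. This is a hedged sketch: the `SingleBenchmarkRunInput` constructor fields, the probe batch size and length, the `intermediate_size` attribute on `ModelConfig`, and the exact `run_benchmarks()` keyword set are assumptions; follow the real signatures in `utils.py`.
+
+```python
+if __name__ == "__main__":
+    args = parse_benchmark_script_args()  # step 1: --model, --overwrite
+    model = get_benchmark_model_config(args.model)
+
+    PROBE_SEQ_LEN = 1024  # hypothetical probe length for step 2
+
+    def _probe():
+        # Probe with the highest-memory baseline provider; estimate_kernel_peak_memory
+        # resets memory stats before the call, runs .backward(), and cleans up after.
+        probe_input = SingleBenchmarkRunInput(  # constructor fields assumed
+            x=PROBE_SEQ_LEN,
+            kernel_provider="huggingface",
+            kernel_operation_mode="full",
+            extra_benchmark_config={
+                "bsz": 2,  # small probe batch, assumed
+                "hidden_size": model.hidden_size,
+                "intermediate_size": model.intermediate_size,  # assumed ModelConfig field
+                "dtype": model.dtype,
+            },
+        )
+        x, layer = _setup_geglu(probe_input)
+        return layer(x)
+
+    peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe)
+
+    # step 3: per-token cost -> safe (batch_size, seq_len) for this device
+    kernel_bpt = peak_bytes // PROBE_SEQ_LEN
+    config = compute_seq_len_sweep_config(model, kernel_bytes_per_token=kernel_bpt)
+
+    # steps 4-5: sweep powers of two up to the safe seq_len
+    x_values = [2**i for i in range(10, 16) if 2**i <= config.seq_len]
+    extra_benchmark_configs = [{
+        "bsz": config.batch_size,
+        "hidden_size": model.hidden_size,
+        "intermediate_size": model.intermediate_size,  # assumed
+        "dtype": model.dtype,
+    }]
+
+    # step 6: run speed and memory over the same grid (kwargs indicative)
+    for bench_fn, metric_name in [(bench_speed_geglu, "speed"), (bench_memory_geglu, "memory")]:
+        run_benchmarks(
+            bench_test_fn=bench_fn,
+            kernel_name="geglu",
+            metric_name=metric_name,
+            x_name="T",
+            x_values=x_values,
+            kernel_providers=["liger", "huggingface"],
+            kernel_operation_modes=["full", "forward", "backward"],
+            extra_benchmark_configs=extra_benchmark_configs,
+            overwrite=args.overwrite,
+        )
+```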
+
+## 4. CLI
+
+Scripts should support:
+
+- `--overwrite`: overwrite existing rows in the benchmark CSV.
+- `--model`: model profile name from `MODEL_REGISTRY` (e.g. `llama_2_7b`, `llama_3_8b`). Default when not set is `DEFAULT_MODEL_CONFIG` (e.g. `llama_3_8b`).
+
+These are provided by `parse_benchmark_script_args()` in `utils.py`.
+
+## 5. Reference scripts
+
+- **Element-wise (single tensor in/out, seq_len sweep)**: `benchmark_geglu.py`, `benchmark_swiglu.py` — `compute_seq_len_sweep_config()`.
+- **Element-wise (single tensor in/out, hidden_size sweep)**: `benchmark_dyt.py` — `compute_hidden_size_sweep_config()`.
+
+## 6. Checklist for a new script
+
+- [ ] Script under `benchmark/scripts/` named `benchmark_<kernel_name>.py`.
+- [ ] Single `_setup_<kernel>(SingleBenchmarkRunInput)` used by both speed and memory.
+- [ ] Speed/memory implemented via `run_speed_benchmark` / `run_memory_benchmark` (or the correct variant for loss / multi-output).
+- [ ] `kernel_operation_modes=["full", "forward", "backward"]` for both speed and memory.
+- [ ] No hardcoded batch size or sequence length; use `compute_seq_len_sweep_config()` or `compute_hidden_size_sweep_config()` (and optionally `estimate_kernel_peak_memory()`).
+- [ ] Model dimensions and dtype from `ModelConfig` / `get_benchmark_model_config()` / `args.model`.
+- [ ] CLI via `parse_benchmark_script_args()` (so `--model` and `--overwrite` work).
+- [ ] Results written through `run_benchmarks()` so data goes to the shared CSV.
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..02c883d9215de7dbe38174c46deb1edd2bb01d4f
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,48 @@
+## Benchmarking Liger Kernels
+
+Follow these steps to benchmark and visualize kernel performance:
+
+1. Create a benchmark script
+   - Add your script under `benchmark/scripts/`
+   - Name it according to the kernel (e.g., `benchmark_<kernel_name>.py`)
+
+2. Run the benchmark
+   - Results will be saved to `benchmark/data/all_benchmark_data.csv`
+
+   Example: Benchmarking KTO Loss
+   ```bash
+   cd benchmark
+   python scripts/benchmark_kto_loss.py
+   ```
+
+3. Visualize results
+   - Use the visualization script with optional modes:
+
+   * To target specific mode(s), pass `--kernel-operation-mode` with one or more values.
+   * If you omit `--kernel-operation-mode`, the script will:
+     - For `speed` metrics: generate plots for all available modes (forward/backward/full).
+     - For `memory` metrics: generate only the `full` plot.
+
+   Examples:
+   1. Specific modes (speed):
+   ```bash
+   python benchmarks_visualizer.py \
+       --kernel-name kto_loss \
+       --metric-name speed \
+       --kernel-operation-mode forward backward
+   ```
+   2. All modes (speed):
+   ```bash
+   python benchmarks_visualizer.py \
+       --kernel-name kto_loss \
+       --metric-name speed
+   ```
+   3. Memory (always full):
+   ```bash
+   python benchmarks_visualizer.py \
+       --kernel-name kto_loss \
+       --metric-name memory
+   ```
+
+4. View results
+   - Generated plots will be saved in `benchmark/visualizations/`
\ No newline at end of file
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/benchmark/benchmarks_visualizer.py b/benchmark/benchmarks_visualizer.py
new file mode 100755
index 0000000000000000000000000000000000000000..e33d844eaeba7a945b660fd4619183e3689226e4
--- /dev/null
+++ b/benchmark/benchmarks_visualizer.py
@@ -0,0 +1,299 @@
+import json
+import os
+import sys
+
+from argparse import ArgumentParser, Namespace
+from dataclasses import dataclass
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+DATA_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "data/all_benchmark_data.csv"))
+VISUALIZATIONS_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "visualizations/"))
+
+
+@dataclass
+class VisualizationsConfig:
+    """
+    Configuration for the visualizations script.
+
+    Args:
+        kernel_name (str): Kernel name to benchmark. (Will run `scripts/benchmark_{kernel_name}.py`)
+        metric_name (str): Metric name to visualize (speed/memory)
+        kernel_operation_mode (str): Kernel operation mode to visualize (forward/backward/full). Defaults to "full"
+        extra_config_filter (str, optional): A string to filter extra_benchmark_config.
+            Can be a substring to match or a 'key=value' pair (e.g., "'H': 4096").
+            Defaults to None, which means the first available config will be used if multiple exist.
+        display (bool): Display the visualization. Defaults to False
+        overwrite (bool): Overwrite an existing visualization. If none exists, this flag has no effect, as plots are always created and saved. Defaults to False
+
+    """
+
+    kernel_name: str
+    metric_name: str
+    kernel_operation_mode: str = "full"
+    extra_config_filter: str | None = None
+    display: bool = False
+    overwrite: bool = False
+
+
+def parse_args() -> Namespace:
+    """Parse command line arguments.
+
+    Returns:
+        Namespace: Parsed command line arguments for the visualizations script.
+    """
+    parser = ArgumentParser()
+    parser.add_argument("--kernel-name", type=str, required=True, help="Kernel name to benchmark")
+    parser.add_argument(
+        "--metric-name",
+        type=str,
+        required=True,
+        help="Metric name to visualize (speed/memory)",
+    )
+    parser.add_argument(
+        "--kernel-operation-mode",
+        type=str,
+        nargs="*",
+        default=None,
+        help="Kernel operation modes to visualize (forward/backward/full). If not provided, generate for all available modes.",
+    )
+    parser.add_argument(
+        "--extra-config-filter",
+        type=str,
+        default=None,
+        help="A string to filter extra_benchmark_config. "
+        "Can be a substring to match or a JSON-like 'key=value' pair (e.g., \"'H': 4096\" or \"H=4096\" for simple cases). "
+        "Defaults to None (first available config if multiple exist).",
+    )
+    parser.add_argument("--display", action="store_true", help="Display the visualization")
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite an existing visualization; if none exists, this flag has no effect, as plots are always created",
+    )
+
+    args = parser.parse_args()
+    return args
+
+
+def load_data(config: VisualizationsConfig) -> pd.DataFrame:
+    """Loads the benchmark data from the CSV file and filters it based on the configuration.
+
+    Args:
+        config (VisualizationsConfig): Configuration object for the visualizations script.
+
+    Raises:
+        ValueError: If no data is found for the given filters.
+
+    Returns:
+        pd.DataFrame: Filtered benchmark dataframe.
+    """
+    df = pd.read_csv(DATA_PATH)
+    df["extra_benchmark_config"] = df["extra_benchmark_config_str"].apply(json.loads)
+
+    base_filtered_df = df[
+        (df["kernel_name"] == config.kernel_name)
+        & (df["metric_name"] == config.metric_name)
+        & (df["kernel_operation_mode"] == config.kernel_operation_mode)
+    ]
+
+    if base_filtered_df.empty:
+        raise ValueError(
+            f"No data found for kernel_name='{config.kernel_name}', "
+            f"metric_name='{config.metric_name}', "
+            f"kernel_operation_mode='{config.kernel_operation_mode}'."
+        )
+
+    unique_extra_configs_str = base_filtered_df["extra_benchmark_config_str"].unique()
+    selected_extra_config_str = None
+
+    if len(unique_extra_configs_str) == 0:
+        print(
+            "Warning: No extra_benchmark_config found for the initial filters. "
+            "Proceeding with all data from initial filter."
+        )
+        return base_filtered_df
+
+    if config.extra_config_filter:
+        matched_configs = []
+        try:
+            if "=" in config.extra_config_filter:
+                key_filter, value_filter = config.extra_config_filter.split("=", 1)
+                for cfg_str in unique_extra_configs_str:
+                    cfg_json = json.loads(cfg_str)
+                    if str(cfg_json.get(key_filter.strip("'\" "))) == value_filter.strip("'\" "):
+                        matched_configs.append(cfg_str)
+            if not matched_configs:
+                matched_configs = [
+                    cfg_str for cfg_str in unique_extra_configs_str if config.extra_config_filter in cfg_str
+                ]
+        except Exception as e:
+            print(
+                f"Note: Could not parse extra_config_filter '{config.extra_config_filter}' as key=value ({e}), using substring match."
+            )
+            matched_configs = [cfg_str for cfg_str in unique_extra_configs_str if config.extra_config_filter in cfg_str]
+
+        if matched_configs:
+            if len(matched_configs) > 1:
+                print(
+                    f"Warning: Multiple extra_benchmark_configs match filter '{config.extra_config_filter}': {matched_configs}. "
+                    f"Using the first one: {matched_configs[0]}"
+                )
+            selected_extra_config_str = matched_configs[0]
+        else:
+            print(
+                f"Warning: No extra_benchmark_config matches filter '{config.extra_config_filter}'. "
+                f"Available configs for {config.kernel_name} ({config.metric_name}, {config.kernel_operation_mode}): {list(unique_extra_configs_str)}"
+            )
+            if len(unique_extra_configs_str) > 0:
+                selected_extra_config_str = unique_extra_configs_str[0]
+                print(f"Defaulting to the first available extra_benchmark_config: {selected_extra_config_str}")
+            else:
+                raise ValueError("No extra_benchmark_config available to select after failed filter attempt.")
+
+    elif len(unique_extra_configs_str) > 1:
+        selected_extra_config_str = unique_extra_configs_str[0]
+        print(
+            f"Warning: Multiple extra_benchmark_configs found for {config.kernel_name} ({config.metric_name}, {config.kernel_operation_mode})."
+        )
+        print(f"Defaulting to use: {selected_extra_config_str}")
+        print(f"Available configs: {list(unique_extra_configs_str)}")
+        print(
+            "Use the --extra-config-filter argument to select a specific one "
+            "(e.g., --extra-config-filter \"'H': 4096\" or a substring like \"'seq_len': 512\")."
+        )
+    elif len(unique_extra_configs_str) == 1:
+        selected_extra_config_str = unique_extra_configs_str[0]
+        print(f"Using unique extra_benchmark_config: {selected_extra_config_str}")
+
+    if selected_extra_config_str:
+        final_filtered_df = base_filtered_df[
+            base_filtered_df["extra_benchmark_config_str"] == selected_extra_config_str
+        ]
+    else:
+        print("Warning: Could not select an extra_benchmark_config. Using data from initial filter if any.")
+        final_filtered_df = base_filtered_df
+
+    if final_filtered_df.empty:
+        raise ValueError(
+            f"No data found after attempting to filter by extra_benchmark_config. "
+            f"Selected/Defaulted extra_config_str: {selected_extra_config_str}"
+            if selected_extra_config_str
+            else "No specific extra_config was selected."
+        )
+
+    print(
+        f"Plotting data for extra_benchmark_config: {json.loads(selected_extra_config_str if selected_extra_config_str else '{}')}"
+    )
+    return final_filtered_df
+
+
+def plot_data(df: pd.DataFrame, config: VisualizationsConfig):
+    """Plots the benchmark data, saving the result if needed.
+
+    Args:
+        df (pd.DataFrame): Filtered benchmark dataframe.
+        config (VisualizationsConfig): Configuration object for the visualizations script.
+ """ + for col in ["y_value_20", "y_value_50", "y_value_80"]: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + + xlabel = df["x_label"].iloc[0] + ylabel = f"{config.metric_name} ({df['metric_unit'].iloc[0]})" + # Sort by "kernel_provider" to ensure consistent color assignment + df = df.sort_values(by="kernel_provider") + + plt.figure(figsize=(10, 6)) + sns.set(style="whitegrid") + try: + ax = sns.lineplot( + data=df, + x="x_value", + y="y_value_50", + hue="kernel_provider", + marker="o", + palette="tab10", + errorbar=("ci", None), + ) + except Exception: + ax = sns.lineplot( + data=df, + x="x_value", + y="y_value_50", + hue="kernel_provider", + marker="o", + palette="tab10", + errorbar=None, + ) + + # Seaborn can't plot pre-computed error bars, so we need to do it manually + lines = ax.get_lines() + colors = [line.get_color() for line in lines] + + for (_, group_data), color in zip(df.groupby("kernel_provider"), colors): + # for i, row in group_data.iterrows(): + y_error_lower = group_data["y_value_50"] - group_data["y_value_20"] + y_error_upper = group_data["y_value_80"] - group_data["y_value_50"] + y_error = [y_error_lower, y_error_upper] + + plt.errorbar( + group_data["x_value"], + group_data["y_value_50"], + yerr=y_error, + fmt="o", + color=color, + capsize=5, + ) + plt.legend(title="Kernel Provider") + plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.tight_layout() + + out_path = os.path.join( + VISUALIZATIONS_PATH, + f"{config.kernel_name}_{config.metric_name}_{config.kernel_operation_mode}.png", + ) + + if config.display: + plt.show() + if config.overwrite or not os.path.exists( + out_path + ): # Save the plot if it doesn't exist or if we want to overwrite it + os.makedirs(VISUALIZATIONS_PATH, exist_ok=True) + plt.savefig(out_path) + plt.close() + + +def main(): + args = parse_args() + all_df = pd.read_csv(DATA_PATH) + all_df["extra_benchmark_config"] = all_df["extra_benchmark_config_str"].apply(json.loads) + + if args.metric_name == "memory": + modes = ["full"] + elif args.kernel_operation_mode: + modes = args.kernel_operation_mode + else: + filtered = all_df[(all_df["kernel_name"] == args.kernel_name) & (all_df["metric_name"] == args.metric_name)] + modes = filtered["kernel_operation_mode"].unique().tolist() + if not modes: + print(f"No data found for kernel '{args.kernel_name}' and metric '{args.metric_name}'.", file=sys.stderr) + sys.exit(1) + + for mode in modes: + config = VisualizationsConfig( + kernel_name=args.kernel_name, + metric_name=args.metric_name, + kernel_operation_mode=mode, + display=args.display, + overwrite=args.overwrite, + ) + df = load_data(config) + plot_data(df, config) + + +if __name__ == "__main__": + main() diff --git a/benchmark/data/all_benchmark_data.csv b/benchmark/data/all_benchmark_data.csv new file mode 100755 index 0000000000000000000000000000000000000000..f63286a16a0e577b7a51bb672706cb713172f024 --- /dev/null +++ b/benchmark/data/all_benchmark_data.csv @@ -0,0 +1,1957 @@ +kernel_name,kernel_provider,kernel_operation_mode,metric_name,metric_unit,x_name,x_label,x_value,y_value_50,y_value_20,y_value_80,extra_benchmark_config_str,gpu_name,timestamp,liger_version +cross_entropy,liger,forward,speed,ms,V,vocab size,4096,0.5324159860610962,0.5291008353233337,0.53476482629776,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:39,0.2.1 +cross_entropy,liger,forward,speed,ms,V,vocab size,8192,0.8101439476013184,0.7565760016441345,0.9144319891929626,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 
15:31:39,0.2.1 +cross_entropy,liger,forward,speed,ms,V,vocab size,16384,1.4320800304412842,1.4087040424346924,1.5254720449447632,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:39,0.2.1 +cross_entropy,liger,forward,speed,ms,V,vocab size,32768,2.8378241062164307,2.805759906768799,2.9447360038757324,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:39,0.2.1 +cross_entropy,liger,forward,speed,ms,V,vocab size,65536,6.805135726928711,6.790579319000244,6.98748779296875,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:39,0.2.1 +cross_entropy,liger,forward,speed,ms,V,vocab size,131072,15.009359359741211,15.00483226776123,15.045599937438965,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:39,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,4096,0.8751360177993774,0.87330561876297,0.8773248195648193,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,8192,1.188480019569397,1.1871488094329834,1.1901824474334717,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,16384,1.9522240161895752,1.9451839923858643,1.962073564529419,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,32768,5.316768169403076,5.314131259918213,5.319046497344971,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,65536,10.615103721618652,10.607129096984863,10.61723518371582,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,huggingface,forward,speed,ms,V,vocab size,131072,20.72643280029297,20.72038459777832,20.758554458618164,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:40,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,4096,0.8637440204620361,0.8607680201530457,0.8670976161956787,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,8192,1.462272047996521,1.4576319456100464,1.4661248922348022,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,16384,2.7454559803009033,2.741612672805786,2.780428647994995,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,32768,5.403264045715332,5.398873329162598,5.4122114181518555,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,65536,11.925024032592773,11.919878005981445,11.92919635772705,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,liger,full,speed,ms,V,vocab size,131072,25.22287940979004,25.21867561340332,25.23493766784668,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:41,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,4096,2.2260000705718994,2.2239038944244385,2.2290303707122803,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,8192,3.5976319313049316,3.595616102218628,3.6007039546966553,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,16384,6.8023200035095215,6.795276641845703,6.806528091430664,"{""B"": 8, ""T"": 2048}",NVIDIA 
A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,32768,15.486032485961914,15.483936309814453,15.48681640625,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,65536,30.778079986572266,30.76335334777832,30.77827262878418,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,speed,ms,V,vocab size,131072,60.43830490112305,60.43830490112305,60.43830490112305,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,4096,256.32861328125,256.32861328125,256.32861328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,8192,512.32861328125,512.32861328125,512.32861328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,16384,1024.32861328125,1024.32861328125,1024.32861328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,32768,2048.32861328125,2048.32861328125,2048.32861328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,65536,4096.32861328125,4096.32861328125,4096.32861328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,liger,full,memory,MB,V,vocab size,131072,8192.328125,8192.328125,8192.328125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,4096,1280.1259765625,1280.1259765625,1280.1259765625,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,8192,2560.1259765625,2560.1259765625,2560.1259765625,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,16384,5120.1259765625,5120.1259765625,5120.1259765625,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,32768,10240.1259765625,10240.1259765625,10240.1259765625,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,65536,20480.125,20480.125,20480.125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +cross_entropy,huggingface,full,memory,MB,V,vocab size,131072,40960.125,40960.125,40960.125,"{""B"": 8, ""T"": 2048}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:42,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,1024,0.04262400045990944,0.04214400053024292,0.04428799822926521,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,2048,0.04668800160288811,0.04560000076889992,0.04825599864125252,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,4096,0.0493599995970726,0.048153601586818695,0.05084799975156784,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding 
dimension,8192,0.05558399856090546,0.054207999259233475,0.0568000003695488,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,16384,0.061503998935222626,0.06022400036454201,0.06260479986667633,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,32768,0.06518399715423584,0.06406400352716446,0.06634879857301712,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,65536,0.06779199838638306,0.06656000018119812,0.06905599683523178,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,131072,0.07091200351715088,0.06963200122117996,0.07225599884986877,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:53,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,1024,0.16672000288963318,0.1416832059621811,0.16777600347995758,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,2048,0.14406399428844452,0.1435839980840683,0.1446399986743927,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,4096,0.1539199948310852,0.15334400534629822,0.1546431928873062,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,8192,0.1627199947834015,0.16179199516773224,0.16357119381427765,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,16384,0.1666879951953888,0.16587519645690918,0.16772480309009552,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,32768,0.1687680035829544,0.16784639656543732,0.1697216033935547,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,65536,0.16918399930000305,0.1685439944267273,0.17001600563526154,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,131072,0.17027199268341064,0.16927999258041382,0.17123199999332428,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:31:56,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,1024,0.039712000638246536,0.03798399865627289,0.04079360142350197,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,2048,0.04652800038456917,0.045318398624658585,0.04755200073122978,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA 
A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,4096,0.05462399870157242,0.05361919850111008,0.05580800026655197,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,8192,0.06015999987721443,0.059487998485565186,0.06102399900555611,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,16384,0.06412799656391144,0.06329599767923355,0.06508159637451172,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,32768,0.066880002617836,0.06583040207624435,0.06777600198984146,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,65536,0.06896000355482101,0.06785280257463455,0.07009919732809067,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,131072,0.06915199756622314,0.0682239979505539,0.06998399645090103,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:01,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,1024,0.44515201449394226,0.4440639913082123,0.4463231861591339,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,2048,0.4620960056781769,0.4610239863395691,0.46300798654556274,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,4096,0.49136000871658325,0.4905087947845459,0.49270400404930115,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,8192,0.5527999997138977,0.5520448088645935,0.5538623929023743,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,16384,0.6350079774856567,0.6340479850769043,0.6363840103149414,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,32768,0.7710559964179993,0.7691839933395386,0.7727680206298828,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,65536,1.002560019493103,1.0006400346755981,1.004467248916626,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,131072,1.4482879638671875,1.4459072351455688,1.4513407945632935,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:05,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,1024,0.4537919759750366,0.4517247974872589,0.46081918478012085,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": 
""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,2048,0.47407999634742737,0.4729023873806,0.47523200511932373,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,4096,0.5310080051422119,0.5298879742622375,0.5320383906364441,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,8192,0.6528639793395996,0.6514303684234619,0.6546239852905273,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,16384,0.8056960105895996,0.8048319816589355,0.807424008846283,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,32768,0.954543948173523,0.9533119797706604,0.9559999704360962,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,65536,1.1960480213165283,1.1946111917495728,1.1982656717300415,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,131072,1.642624020576477,1.6409599781036377,1.6447807550430298,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:08,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,1024,0.3001280128955841,0.29503998160362244,0.30576640367507935,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,2048,0.297760009765625,0.2938239872455597,0.3054080009460449,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,4096,0.2991679906845093,0.2956480085849762,0.3070079982280731,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,8192,0.2961280047893524,0.2899264097213745,0.3029248118400574,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,16384,0.3465920090675354,0.34563198685646057,0.3476351797580719,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,32768,0.46585598587989807,0.4641471803188324,0.4674175977706909,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,65536,0.6924160122871399,0.6907200217247009,0.6938239932060242,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,131072,1.1352640390396118,1.1327999830245972,1.1376447677612305,"{""B"": 32, ""T"": 
512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:13,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,1024,0.18961599469184875,0.1879040002822876,0.19174399971961975,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,2048,0.21296000480651855,0.2112639993429184,0.21513600647449493,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,4096,0.2367040067911148,0.23467519879341125,0.23888640105724335,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,8192,0.26335999369621277,0.26099199056625366,0.2656640112400055,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,16384,0.2850880026817322,0.28336000442504883,0.2869440019130707,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,32768,0.30460798740386963,0.3023360073566437,0.30684158205986023,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,65536,0.31569600105285645,0.3138048052787781,0.3180544078350067,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,liger,forward,speed,ms,V,embedding dimension,131072,0.31988799571990967,0.31808000802993774,0.3219392001628876,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:28,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,1024,0.7865599989891052,0.7846271991729736,0.7891008257865906,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,2048,0.8262079954147339,0.8236607909202576,0.8279871940612793,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,4096,0.8446240425109863,0.8429504036903381,0.8475391864776611,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,8192,0.8540480136871338,0.8518400192260742,0.8557760119438171,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,16384,0.857695996761322,0.8553280234336853,0.8595200181007385,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,32768,0.8596479892730713,0.8576639890670776,0.8618879914283752,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding 
dimension,65536,1.0087039470672607,0.8624832034111023,1.0126848220825195,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,huggingface,forward,speed,ms,V,embedding dimension,131072,0.8633919954299927,0.8609600067138672,0.8647680282592773,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:43,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,1024,0.2572160065174103,0.255840003490448,0.25833600759506226,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,2048,0.2817760109901428,0.2805440127849579,0.2831552028656006,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,4096,0.30182400345802307,0.3002175986766815,0.3032831847667694,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,8192,0.3126400113105774,0.3114303946495056,0.31427839398384094,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,16384,0.3190400004386902,0.31795841455459595,0.32016000151634216,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,32768,0.32419198751449585,0.32281601428985596,0.32559359073638916,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,65536,0.3238080143928528,0.32236799597740173,0.3250240087509155,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,torch_compile,forward,speed,ms,V,embedding dimension,131072,0.3256959915161133,0.32434558868408203,0.32689279317855835,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:32:58,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,1024,2.17740797996521,2.1755776405334473,2.180025577545166,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,2048,2.2861440181732178,2.284735918045044,2.2882239818573,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,4096,2.4825921058654785,2.48024320602417,2.484800100326538,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,8192,2.74452805519104,2.7430784702301025,2.7452287673950195,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,16384,3.1216320991516113,3.1202433109283447,3.125638484954834,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 
+embedding,liger,full,speed,ms,V,embedding dimension,32768,3.7801599502563477,3.774118423461914,3.7824511528015137,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,65536,4.991136074066162,4.9875006675720215,4.993491172790527,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,liger,full,speed,ms,V,embedding dimension,131072,7.383471965789795,7.377497673034668,7.386828899383545,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:13,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,1024,1.5774879455566406,1.5668543577194214,1.7933248281478882,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,2048,1.7074079513549805,1.7012799978256226,1.8109056949615479,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,4096,1.950543999671936,1.9466559886932373,1.9592640399932861,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,8192,2.404927968978882,2.400460720062256,2.4551360607147217,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,16384,3.119904041290283,3.1171774864196777,3.1267263889312744,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,32768,4.32857608795166,4.321491241455078,4.439519882202148,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,65536,5.065216064453125,5.059558391571045,5.115980625152588,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,huggingface,full,speed,ms,V,embedding dimension,131072,7.489376068115234,7.484294414520264,7.5203776359558105,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:28,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,1024,1.0930559635162354,1.0918079614639282,1.0945919752120972,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,2048,1.1930559873580933,1.191705584526062,1.1951104402542114,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,4096,1.3096319437026978,1.3073855638504028,1.3119615316390991,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1 +embedding,torch_compile,full,speed,ms,V,embedding dimension,8192,1.4822720289230347,1.480512022972107,1.4839999675750732,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1 
+embedding,torch_compile,full,speed,ms,V,embedding dimension,16384,1.7870559692382812,1.7859647274017334,1.7892736196517944,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1
+embedding,torch_compile,full,speed,ms,V,embedding dimension,32768,2.3838400840759277,2.381312131881714,2.3860929012298584,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1
+embedding,torch_compile,full,speed,ms,V,embedding dimension,65536,3.7430078983306885,3.740166425704956,3.745452880859375,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1
+embedding,torch_compile,full,speed,ms,V,embedding dimension,131072,5.940896034240723,5.934713363647461,5.943462371826172,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:43,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,1024,12348.125,12348.125,12348.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,2048,12360.125,12360.125,12360.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,4096,12384.125,12384.125,12384.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,8192,12432.125,12432.125,12432.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,16384,12528.125,12528.125,12528.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,32768,12720.125,12720.125,12720.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,65536,13104.125,13104.125,13104.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,131072,13872.125,13872.125,13872.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:45,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,1024,12356.537109375,12356.537109375,12356.537109375,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,2048,12371.359375,12371.359375,12371.359375,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,4096,12401.40625,12401.40625,12401.40625,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,8192,12461.5,12461.5,12461.5,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,16384,12581.6875,12581.6875,12581.6875,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,32768,12773.6875,12773.6875,12773.6875,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,65536,13157.6875,13157.6875,13157.6875,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,131072,13925.6875,13925.6875,13925.6875,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:48,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,1024,12348.125,12348.125,12348.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,2048,12366.125,12366.125,12366.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,4096,12402.125,12402.125,12402.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,8192,12474.125,12474.125,12474.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,16384,12618.125,12618.125,12618.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,32768,12906.125,12906.125,12906.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,65536,13482.125,13482.125,13482.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,131072,14634.125,14634.125,14634.125,"{""B"": 32, ""T"": 512, ""D"": 768, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:33:52,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,1024,14346.125,14346.125,14346.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,2048,14410.125,14410.125,14410.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,4096,14538.125,14538.125,14538.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,8192,14794.125,14794.125,14794.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,16384,15306.125,15306.125,15306.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,32768,16330.125,16330.125,16330.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,65536,18378.125,18378.125,18378.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,liger,full,memory,MB,V,embedding dimension,131072,22474.125,22474.125,22474.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:04,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,1024,14388.130859375,14388.130859375,14388.130859375,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,2048,14468.154296875,14468.154296875,14468.154296875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,4096,14628.201171875,14628.201171875,14628.201171875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,8192,14948.294921875,14948.294921875,14948.294921875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,16384,15588.482421875,15588.482421875,15588.482421875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,32768,16612.482421875,16612.482421875,16612.482421875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,65536,18660.482421875,18660.482421875,18660.482421875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,huggingface,full,memory,MB,V,embedding dimension,131072,22756.482421875,22756.482421875,22756.482421875,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:17,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,1024,14346.125,14346.125,14346.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,2048,14442.125,14442.125,14442.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,4096,14634.125,14634.125,14634.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,8192,15018.125,15018.125,15018.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,16384,1536.125,1536.125,1536.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,32768,3072.125,3072.125,3072.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,65536,6144.125,6144.125,6144.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+embedding,torch_compile,full,memory,MB,V,embedding dimension,131072,12288.125,12288.125,12288.125,"{""B"": 8, ""T"": 2048, ""D"": 4096, ""dtype"": ""torch.float32""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:34:31,0.2.1
+fused_linear_cross_entropy,liger,forward,speed,ms,BT,B x T,4096,119.52153778076172,119.52153778076172,119.52153778076172,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:03,0.4.2
+fused_linear_cross_entropy,liger,forward,speed,ms,BT,B x T,8192,168.08563232421875,168.08563232421875,168.08563232421875,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:03,0.4.2
+fused_linear_cross_entropy,liger,forward,speed,ms,BT,B x T,16384,274.07342529296875,274.07342529296875,274.07342529296875,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:03,0.4.2
+fused_linear_cross_entropy,liger,forward,speed,ms,BT,B x T,32768,508.4652099609375,508.4652099609375,508.4652099609375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:03,0.4.2
+fused_linear_cross_entropy,huggingface,forward,speed,ms,BT,B x T,4096,20.911680221557617,20.90903663635254,20.915321350097656,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:34,0.4.2
+fused_linear_cross_entropy,huggingface,forward,speed,ms,BT,B x T,8192,37.97203063964844,37.9546012878418,37.989463806152344,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:34,0.4.2
+fused_linear_cross_entropy,huggingface,forward,speed,ms,BT,B x T,16384,76.39142608642578,76.39142608642578,76.39142608642578,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:34,0.4.2
+fused_linear_cross_entropy,huggingface,forward,speed,ms,BT,B x T,32768,151.91404724121094,151.91404724121094,151.91404724121094,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:44:34,0.4.2
+fused_linear_cross_entropy,liger,full,speed,ms,BT,B x T,4096,121.43059539794922,121.43059539794922,121.43059539794922,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:11,0.4.2
+fused_linear_cross_entropy,liger,full,speed,ms,BT,B x T,8192,166.70867919921875,166.70867919921875,166.70867919921875,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:11,0.4.2
+fused_linear_cross_entropy,liger,full,speed,ms,BT,B x T,16384,277.1166687011719,277.1166687011719,277.1166687011719,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:11,0.4.2
+fused_linear_cross_entropy,liger,full,speed,ms,BT,B x T,32768,511.0638732910156,511.0638732910156,511.0638732910156,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:11,0.4.2
+fused_linear_cross_entropy,huggingface,full,speed,ms,BT,B x T,4096,55.96684646606445,55.96684646606445,55.96684646606445,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:46,0.4.2
+fused_linear_cross_entropy,huggingface,full,speed,ms,BT,B x T,8192,111.45471954345703,111.45471954345703,111.45471954345703,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:46,0.4.2
+fused_linear_cross_entropy,huggingface,full,speed,ms,BT,B x T,16384,220.7836151123047,220.7836151123047,220.7836151123047,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:46,0.4.2
+fused_linear_cross_entropy,huggingface,full,speed,ms,BT,B x T,32768,452.4712829589844,452.4712829589844,452.4712829589844,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:45:46,0.4.2
+fused_linear_cross_entropy,liger,full,memory,MB,BT,B x T,4096,4245.5478515625,4245.5478515625,4245.5478515625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:25,0.4.2
+fused_linear_cross_entropy,liger,full,memory,MB,BT,B x T,8192,4466.9697265625,4466.9697265625,4466.9697265625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:25,0.4.2
+fused_linear_cross_entropy,liger,full,memory,MB,BT,B x T,16384,4910.4384765625,4910.4384765625,4910.4384765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:25,0.4.2
+fused_linear_cross_entropy,liger,full,memory,MB,BT,B x T,32768,5794.6259765625,5794.6259765625,5794.6259765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:25,0.4.2
+fused_linear_cross_entropy,huggingface,full,memory,MB,BT,B x T,4096,6092.2822265625,6092.2822265625,6092.2822265625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:53,0.4.2
+fused_linear_cross_entropy,huggingface,full,memory,MB,BT,B x T,8192,9162.3134765625,9162.3134765625,9162.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:53,0.4.2
+fused_linear_cross_entropy,huggingface,full,memory,MB,BT,B x T,16384,15302.3759765625,15302.3759765625,15302.3759765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:53,0.4.2
+fused_linear_cross_entropy,huggingface,full,memory,MB,BT,B x T,32768,27582.5,27582.5,27582.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-22 17:46:53,0.4.2
+geglu,liger,full,speed,ms,T,sequence length,1024,30.03536033630371,30.03536033630371,30.03536033630371,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:14,0.2.1
+geglu,liger,full,speed,ms,T,sequence length,2048,54.04060745239258,54.04060745239258,54.04060745239258,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:14,0.2.1
+geglu,liger,full,speed,ms,T,sequence length,4096,108.52435302734375,108.52435302734375,108.52435302734375,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:14,0.2.1
+geglu,liger,full,speed,ms,T,sequence length,8192,216.6227264404297,216.6227264404297,216.6227264404297,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:14,0.2.1
+geglu,huggingface,full,speed,ms,T,sequence length,1024,27.938560485839844,27.938560485839844,27.938560485839844,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:21,0.2.1
+geglu,huggingface,full,speed,ms,T,sequence length,2048,54.51279830932617,54.51279830932617,54.51279830932617,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:21,0.2.1
+geglu,huggingface,full,speed,ms,T,sequence length,4096,110.97718048095703,110.97718048095703,110.97718048095703,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:21,0.2.1
+geglu,huggingface,full,speed,ms,T,sequence length,8192,220.93954467773438,220.93954467773438,220.93954467773438,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:21,0.2.1
+geglu,liger,forward,speed,ms,T,sequence length,1024,9.280096054077148,9.280096054077148,9.280096054077148,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:26,0.2.1
+geglu,liger,forward,speed,ms,T,sequence length,2048,17.59040069580078,17.59040069580078,17.59040069580078,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:26,0.2.1
+geglu,liger,forward,speed,ms,T,sequence length,4096,36.18726348876953,36.18726348876953,36.18726348876953,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:26,0.2.1
+geglu,liger,forward,speed,ms,T,sequence length,8192,72.60655975341797,72.60655975341797,72.60655975341797,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:26,0.2.1
+geglu,huggingface,forward,speed,ms,T,sequence length,1024,9.257439613342285,9.257439613342285,9.257439613342285,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:31,0.2.1
+geglu,huggingface,forward,speed,ms,T,sequence length,2048,18.099519729614258,18.099519729614258,18.099519729614258,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:31,0.2.1
+geglu,huggingface,forward,speed,ms,T,sequence length,4096,36.37263870239258,36.37263870239258,36.37263870239258,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:31,0.2.1
+geglu,huggingface,forward,speed,ms,T,sequence length,8192,72.66553497314453,72.66553497314453,72.66553497314453,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:31,0.2.1
+geglu,liger,backward,speed,ms,T,sequence length,1024,18.088287353515625,18.088287353515625,18.088287353515625,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:37,0.2.1
+geglu,liger,backward,speed,ms,T,sequence length,2048,35.195518493652344,35.195518493652344,35.195518493652344,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:37,0.2.1
+geglu,liger,backward,speed,ms,T,sequence length,4096,70.51395416259766,70.51395416259766,70.51395416259766,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:37,0.2.1
+geglu,liger,backward,speed,ms,T,sequence length,8192,141.28550720214844,141.28550720214844,141.28550720214844,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:37,0.2.1
+geglu,huggingface,backward,speed,ms,T,sequence length,1024,18.521728515625,18.521728515625,18.521728515625,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:42,0.2.1
+geglu,huggingface,backward,speed,ms,T,sequence length,2048,36.045406341552734,36.045406341552734,36.045406341552734,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:42,0.2.1
+geglu,huggingface,backward,speed,ms,T,sequence length,4096,72.88412475585938,72.88412475585938,72.88412475585938,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:42,0.2.1
+geglu,huggingface,backward,speed,ms,T,sequence length,8192,144.2132110595703,144.2132110595703,144.2132110595703,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:42,0.2.1
+geglu,liger,full,memory,MB,T,sequence length,1024,1582.25,1582.25,1582.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:45,0.2.1
+geglu,liger,full,memory,MB,T,sequence length,2048,2546.25,2546.25,2546.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:45,0.2.1
+geglu,liger,full,memory,MB,T,sequence length,4096,4474.25,4474.25,4474.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:45,0.2.1
+geglu,liger,full,memory,MB,T,sequence length,8192,8330.25,8330.25,8330.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:45,0.2.1
+geglu,huggingface,full,memory,MB,T,sequence length,1024,1992.25,1992.25,1992.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:50,0.2.1
+geglu,huggingface,full,memory,MB,T,sequence length,2048,3452.25,3452.25,3452.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:50,0.2.1
+geglu,huggingface,full,memory,MB,T,sequence length,4096,6372.25,6372.25,6372.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:50,0.2.1
+geglu,huggingface,full,memory,MB,T,sequence length,8192,12212.25,12212.25,12212.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:50,0.2.1
+geglu,liger,forward,memory,MB,T,sequence length,1024,918.25,918.25,918.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:55,0.2.1
+geglu,liger,forward,memory,MB,T,sequence length,2048,1562.25,1562.25,1562.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:55,0.2.1
+geglu,liger,forward,memory,MB,T,sequence length,4096,2850.25,2850.25,2850.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:55,0.2.1
+geglu,liger,forward,memory,MB,T,sequence length,8192,5426.25,5426.25,5426.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:55,0.2.1
+geglu,huggingface,forward,memory,MB,T,sequence length,1024,1090.25,1090.25,1090.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:58,0.2.1
+geglu,huggingface,forward,memory,MB,T,sequence length,2048,1906.25,1906.25,1906.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:58,0.2.1
+geglu,huggingface,forward,memory,MB,T,sequence length,4096,3538.25,3538.25,3538.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:58,0.2.1
+geglu,huggingface,forward,memory,MB,T,sequence length,8192,6802.25,6802.25,6802.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:38:58,0.2.1
+geglu,liger,backward,memory,MB,T,sequence length,1024,1582.25,1582.25,1582.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:02,0.2.1
+geglu,liger,backward,memory,MB,T,sequence length,2048,2546.25,2546.25,2546.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:02,0.2.1
+geglu,liger,backward,memory,MB,T,sequence length,4096,4474.25,4474.25,4474.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:02,0.2.1
+geglu,liger,backward,memory,MB,T,sequence length,8192,8330.25,8330.25,8330.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:02,0.2.1
+geglu,huggingface,backward,memory,MB,T,sequence length,1024,1992.25,1992.25,1992.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:06,0.2.1
+geglu,huggingface,backward,memory,MB,T,sequence length,2048,3452.25,3452.25,3452.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:06,0.2.1
+geglu,huggingface,backward,memory,MB,T,sequence length,4096,6372.25,6372.25,6372.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:06,0.2.1
+geglu,huggingface,backward,memory,MB,T,sequence length,8192,12212.25,12212.25,12212.25,"{""bsz"": 8, ""hidden_size"": 4096, ""intermediate_size"": 11008, ""hidden_act"": ""gelu_pytorch_tanh"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:06,0.2.1
+layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.030271999537944794,0.02921600081026554,0.03142400085926056,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:14,0.2.1
+layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.04992000013589859,0.04912000149488449,0.050783999264240265,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:14,0.2.1
+layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.08816000074148178,0.08739200234413147,0.08899199962615967,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:14,0.2.1
+layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.16521599888801575,0.16435199975967407,0.16627199947834015,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:14,0.2.1
+layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.32230401039123535,0.32070401310920715,0.32393598556518555,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:14,0.2.1
+layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.034143999218940735,0.033376000821590424,0.03580800071358681,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:17,0.2.1
+layer_norm,huggingface,forward,speed,ms,N,hidden size,2048,0.05734400078654289,0.05615999922156334,0.05859199911355972,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:17,0.2.1
+layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.1218239963054657,0.12054400146007538,0.12316799908876419,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:17,0.2.1
+layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.25755199790000916,0.255840003490448,0.25939199328422546,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:17,0.2.1
+layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.5066879987716675,0.5045183897018433,0.5089280009269714,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:17,0.2.1
+layer_norm,liger,full,speed,ms,N,hidden size,1024,0.28019198775291443,0.2780799865722656,0.284960001707077,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:19,0.2.1
+layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27827200293540955,0.27638399600982666,0.2824704051017761,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:19,0.2.1
+layer_norm,liger,full,speed,ms,N,hidden size,4096,0.2847039997577667,0.27955201268196106,0.2908479869365692,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:19,0.2.1
+layer_norm,liger,full,speed,ms,N,hidden size,8192,0.4405759871006012,0.43780481815338135,0.4440320134162903,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:19,0.2.1
+layer_norm,liger,full,speed,ms,N,hidden size,16384,1.1488319635391235,1.1439871788024902,1.1527807712554932,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:19,0.2.1
+layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.11884800344705582,0.11750400066375732,0.12035199999809265,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.1966399997472763,0.19432319700717926,0.19888000190258026,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.43142399191856384,0.42931199073791504,0.4336639940738678,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.829584002494812,0.826918363571167,0.832857608795166,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,speed,ms,N,hidden size,16384,1.6212799549102783,1.6171647310256958,1.6246912479400635,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,liger,full,memory,MB,N,hidden size,1024,80.90625,80.90625,80.90625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,liger,full,memory,MB,N,hidden size,2048,161.78125,161.78125,161.78125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,liger,full,memory,MB,N,hidden size,4096,323.53125,323.53125,323.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,liger,full,memory,MB,N,hidden size,8192,647.03125,647.03125,647.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,liger,full,memory,MB,N,hidden size,16384,1294.03125,1294.03125,1294.03125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:21,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,1024,0.01360000018030405,0.012864000163972378,0.01603199914097786,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,2048,0.019999999552965164,0.018624000251293182,0.02160000056028366,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,4096,0.031072000041604042,0.030047999694943428,0.031968001276254654,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,8192,0.0517439991235733,0.050624001771211624,0.05289600044488907,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,16384,0.0952640026807785,0.0942080020904541,0.09667199850082397,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,liger,forward,speed,ms,H,hidden size,32768,0.18223999440670013,0.18035200238227844,0.18417279422283173,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:30,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,1024,0.07820799946784973,0.0777600035071373,0.0790719985961914,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,2048,0.13631999492645264,0.13555200397968292,0.13731199502944946,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,4096,0.27990400791168213,0.2789439857006073,0.28118398785591125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,8192,0.5190399885177612,0.5175359845161438,0.5209856033325195,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,16384,0.9856320023536682,0.9835839867591858,0.9876928329467773,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,huggingface,forward,speed,ms,H,hidden size,32768,1.9190720319747925,1.917081594467163,1.921875238418579,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:33,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,1024,0.28601598739624023,0.2837119996547699,0.29068800806999207,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,2048,0.286624014377594,0.2845824062824249,0.2905920147895813,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,4096,0.28830400109291077,0.28533118963241577,0.2935168147087097,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,8192,0.29407998919487,0.289216011762619,0.3038719892501831,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,16384,0.410863995552063,0.4088575839996338,0.41293439269065857,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,liger,full,speed,ms,H,hidden size,32768,1.2316479682922363,1.228230357170105,1.235001564025879,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:36,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,1024,0.3176960051059723,0.3147839903831482,0.32177281379699707,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,2048,0.49038398265838623,0.4888896048069,0.4920639991760254,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,4096,1.011423945426941,1.0089855194091797,1.013759970664978,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,8192,1.8621759414672852,1.859769582748413,1.8646591901779175,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,16384,3.5439999103546143,3.5410239696502686,3.547679901123047,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,huggingface,full,speed,ms,H,hidden size,32768,6.910431861877441,6.907142639160156,6.914393901824951,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:40,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,1024,0.09372799843549728,0.09177599847316742,0.09763199836015701,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,2048,0.09030400216579437,0.08746880292892456,0.09398400038480759,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,4096,0.09913600236177444,0.09804800152778625,0.10039679706096649,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,8192,0.17801600694656372,0.1765120029449463,0.1793919950723648,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,16384,0.32051199674606323,0.3187839984893799,0.32230401039123535,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,liger,backward,speed,ms,H,hidden size,32768,1.0562880039215088,1.053491234779358,1.059673547744751,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:43,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,1024,0.19577600061893463,0.19523200392723083,0.19631999731063843,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,2048,0.36188799142837524,0.3601599931716919,0.363647997379303,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,4096,0.7403839826583862,0.7381759881973267,0.7426176071166992,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,8192,1.3515520095825195,1.348736047744751,1.3550655841827393,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,16384,2.569632053375244,2.5663681030273438,2.5731201171875,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,backward,speed,ms,H,hidden size,32768,5.0147199630737305,5.011123180389404,5.0179901123046875,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,1024,36.02392578125,36.02392578125,36.02392578125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,2048,72.03955078125,72.03955078125,72.03955078125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,4096,144.07080078125,144.07080078125,144.07080078125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,8192,268.13330078125,268.13330078125,268.13330078125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,16384,432.25830078125,432.25830078125,432.25830078125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,liger,full,memory,MB,H,hidden size,32768,752.5087890625,752.5087890625,752.5087890625,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,1024,80.01953125,80.01953125,80.01953125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,2048,160.03125,160.03125,160.03125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,4096,320.0546875,320.0546875,320.0546875,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,8192,640.1015625,640.1015625,640.1015625,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,16384,1280.1953125,1280.1953125,1280.1953125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rms_norm,huggingface,full,memory,MB,H,hidden size,32768,2560.3828125,2560.3828125,2560.3828125,"{""M"": 2048, ""dtype"": ""torch.bfloat16"", ""eps"": 1e-06}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:46,0.2.1
+rope,liger,forward,speed,ms,H,hidden size,512,0.011359999887645245,0.01033599954098463,0.011455999687314034,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:55,0.2.1
+rope,liger,forward,speed,ms,H,hidden size,2048,0.020864000543951988,0.020447999238967896,0.02239999920129776,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:55,0.2.1
+rope,liger,forward,speed,ms,H,hidden size,8192,0.059487998485565186,0.05830400064587593,0.06060799956321716,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:55,0.2.1
+rope,huggingface,forward,speed,ms,H,hidden size,512,0.07968000322580338,0.07923199981451035,0.10408961027860641,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:56,0.2.1
+rope,huggingface,forward,speed,ms,H,hidden size,2048,0.1570879966020584,0.15651200711727142,0.15785600244998932,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:56,0.2.1
+rope,huggingface,forward,speed,ms,H,hidden size,8192,0.5167999863624573,0.5161600112915039,0.5176640152931213,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:56,0.2.1
+rope,liger,backward,speed,ms,H,hidden size,512,0.12227199971675873,0.05539200082421303,0.1699904054403305,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:57,0.2.1
+rope,liger,backward,speed,ms,H,hidden size,2048,0.12337599694728851,0.11945600062608719,0.15338242053985596,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:57,0.2.1
+rope,liger,backward,speed,ms,H,hidden size,8192,0.12812800705432892,0.11593600362539291,0.1985855996608734,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:57,0.2.1
+rope,huggingface,backward,speed,ms,H,hidden size,512,0.2648000121116638,0.2489279955625534,0.3578239977359772,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:59,0.2.1
+rope,huggingface,backward,speed,ms,H,hidden size,2048,0.2536320090293884,0.24692480266094208,0.31929606199264526,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:59,0.2.1
+rope,huggingface,backward,speed,ms,H,hidden size,8192,0.621504008769989,0.6208000183105469,0.6223679780960083,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:39:59,0.2.1
+rope,liger,full,speed,ms,H,hidden size,512,0.27401599287986755,0.26447999477386475,0.3555007874965668,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:00,0.2.1
+rope,liger,full,speed,ms,H,hidden size,2048,0.2815040051937103,0.26904961466789246,0.3562496304512024,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:00,0.2.1
+rope,liger,full,speed,ms,H,hidden size,8192,0.2759679853916168,0.267244815826416,0.3601728081703186,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:00,0.2.1
+rope,huggingface,full,speed,ms,H,hidden size,512,0.5160639882087708,0.5028480291366577,0.6553279757499695,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,huggingface,full,speed,ms,H,hidden size,2048,0.5289119482040405,0.510598361492157,0.7208256721496582,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,huggingface,full,speed,ms,H,hidden size,8192,1.1329920291900635,1.1318720579147339,1.1339199542999268,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,liger,full,memory,MB,H,hidden size,512,13.26611328125,13.26611328125,13.26611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,liger,full,memory,MB,H,hidden size,2048,28.64111328125,28.64111328125,28.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,liger,full,memory,MB,H,hidden size,8192,90.14111328125,90.14111328125,90.14111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,huggingface,full,memory,MB,H,hidden size,512,22.26611328125,22.26611328125,22.26611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,huggingface,full,memory,MB,H,hidden size,2048,64.64111328125,64.64111328125,64.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,huggingface,full,memory,MB,H,hidden size,8192,234.14111328125,234.14111328125,234.14111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:01,0.2.1
+rope,liger,forward,speed,ms,T,sequence length,1024,0.034432001411914825,0.03340800106525421,0.03545600175857544,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:04,0.2.1
+rope,liger,forward,speed,ms,T,sequence length,2048,0.058880001306533813,0.0578560009598732,0.059859201312065125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:04,0.2.1
+rope,liger,forward,speed,ms,T,sequence length,4096,0.10899200290441513,0.10784000158309937,0.1101439967751503,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:04,0.2.1
+rope,liger,forward,speed,ms,T,sequence length,8192,0.20927999913692474,0.20796799659729004,0.21059200167655945,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:04,0.2.1
+rope,liger,forward,speed,ms,T,sequence length,16384,0.4105280041694641,0.4089151918888092,0.41203200817108154,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:04,0.2.1
+rope,huggingface,forward,speed,ms,T,sequence length,1024,0.2808319926261902,0.28019198775291443,0.28160640597343445,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:06,0.2.1
+rope,huggingface,forward,speed,ms,T,sequence length,2048,0.5160959959030151,0.5155072212219238,0.5169280171394348,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:06,0.2.1
+rope,huggingface,forward,speed,ms,T,sequence length,4096,0.9947839975357056,0.9939200282096863,0.9956799745559692,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:06,0.2.1
+rope,huggingface,forward,speed,ms,T,sequence length,8192,1.9332640171051025,1.9323519468307495,1.9344960451126099,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:06,0.2.1
+rope,huggingface,forward,speed,ms,T,sequence length,16384,3.8169920444488525,3.815808057785034,3.8180160522460938,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:06,0.2.1
+rope,liger,backward,speed,ms,T,sequence length,1024,0.1260479986667633,0.12014079838991165,0.143449604511261,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:08,0.2.1
+rope,liger,backward,speed,ms,T,sequence length,2048,0.11606399714946747,0.11021439731121063,0.12432000041007996,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:08,0.2.1
+rope,liger,backward,speed,ms,T,sequence length,4096,0.12409599870443344,0.11817599833011627,0.1313920021057129,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:08,0.2.1
+rope,liger,backward,speed,ms,T,sequence length,8192,0.21004800498485565,0.20867200195789337,0.21164800226688385,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:08,0.2.1
+rope,liger,backward,speed,ms,T,sequence length,16384,0.4102399945259094,0.40871042013168335,0.4119040071964264,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:08,0.2.1
+rope,huggingface,backward,speed,ms,T,sequence length,1024,0.3304319977760315,0.3296447992324829,0.3314239978790283,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:10,0.2.1
+rope,huggingface,backward,speed,ms,T,sequence length,2048,0.6213759779930115,0.6205440163612366,0.6223359704017639,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:10,0.2.1
+rope,huggingface,backward,speed,ms,T,sequence length,4096,1.1872799396514893,1.1858432292938232,1.1886080503463745,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:10,0.2.1
+rope,huggingface,backward,speed,ms,T,sequence length,8192,2.321280002593994,2.318873643875122,2.324160099029541,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:10,0.2.1
+rope,huggingface,backward,speed,ms,T,sequence length,16384,4.557248115539551,4.550220966339111,4.560742378234863,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:10,0.2.1
+rope,liger,full,speed,ms,T,sequence length,1024,0.2682560086250305,0.2641535997390747,0.2762559950351715,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:12,0.2.1
+rope,liger,full,speed,ms,T,sequence length,2048,0.2654559910297394,0.26105600595474243,0.2746559977531433,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:12,0.2.1
+rope,liger,full,speed,ms,T,sequence length,4096,0.2650560140609741,0.2608831822872162,0.2715519964694977,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:12,0.2.1
+rope,liger,full,speed,ms,T,sequence length,8192,0.4158720076084137,0.41413119435310364,0.4178048074245453,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:12,0.2.1
+rope,liger,full,speed,ms,T,sequence length,16384,0.8167039752006531,0.8143680095672607,0.8189184069633484,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:12,0.2.1
+rope,huggingface,full,speed,ms,T,sequence length,1024,0.6059200167655945,0.6047679781913757,0.6072319746017456,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,huggingface,full,speed,ms,T,sequence length,2048,1.1326719522476196,1.1318080425262451,1.133631944656372,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,huggingface,full,speed,ms,T,sequence length,4096,2.176192045211792,2.175136089324951,2.177433729171753,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,huggingface,full,speed,ms,T,sequence length,8192,4.248256206512451,4.246367931365967,4.2566399574279785,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,huggingface,full,speed,ms,T,sequence length,16384,8.365951538085938,8.36348819732666,8.380928039550781,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,liger,full,memory,MB,T,sequence length,1024,49.13330078125,49.13330078125,49.13330078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,liger,full,memory,MB,T,sequence length,2048,90.14111328125,90.14111328125,90.14111328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,liger,full,memory,MB,T,sequence length,4096,172.15673828125,172.15673828125,172.15673828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,liger,full,memory,MB,T,sequence length,8192,336.18798828125,336.18798828125,336.18798828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,liger,full,memory,MB,T,sequence length,16384,664.25048828125,664.25048828125,664.25048828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:14,0.2.1
+rope,huggingface,full,memory,MB,T,sequence length,1024,121.13330078125,121.13330078125,121.13330078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:15,0.2.1
+rope,huggingface,full,memory,MB,T,sequence length,2048,234.14111328125,234.14111328125,234.14111328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:15,0.2.1
+rope,huggingface,full,memory,MB,T,sequence length,4096,460.15673828125,460.15673828125,460.15673828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:15,0.2.1
+rope,huggingface,full,memory,MB,T,sequence length,8192,912.18798828125,912.18798828125,912.18798828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:15,0.2.1
+rope,huggingface,full,memory,MB,T,sequence length,16384,1816.25048828125,1816.25048828125,1816.25048828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:15,0.2.1
+swiglu,liger,forward,speed,ms,T,sequence length,1024,5.06441593170166,5.06441593170166,5.06441593170166,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:24,0.2.1
+swiglu,liger,forward,speed,ms,T,sequence length,2048,10.075455665588379,10.075455665588379,10.075455665588379,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:24,0.2.1
+swiglu,liger,forward,speed,ms,T,sequence length,4096,18.001951217651367,18.001951217651367,18.001951217651367,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:24,0.2.1
+swiglu,liger,forward,speed,ms,T,sequence length,8192,35.930015563964844,35.930015563964844,35.930015563964844,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:24,0.2.1
+swiglu,huggingface,forward,speed,ms,T,sequence length,1024,4.582320213317871,4.5821757316589355,4.582464218139648,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:28,0.2.1
+swiglu,huggingface,forward,speed,ms,T,sequence length,2048,9.252832412719727,9.252832412719727,9.252832412719727,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:28,0.2.1
+swiglu,huggingface,forward,speed,ms,T,sequence length,4096,18.160255432128906,18.160255432128906,18.160255432128906,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:28,0.2.1
+swiglu,huggingface,forward,speed,ms,T,sequence length,8192,36.2911376953125,36.2911376953125,36.2911376953125,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:28,0.2.1
+swiglu,liger,full,memory,MB,T,sequence length,1024,1100.25,1100.25,1100.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:32,0.2.1
+swiglu,liger,full,memory,MB,T,sequence length,2048,1582.25,1582.25,1582.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:32,0.2.1
+swiglu,liger,full,memory,MB,T,sequence length,4096,2546.25,2546.25,2546.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:32,0.2.1
+swiglu,liger,full,memory,MB,T,sequence length,8192,4474.25,4474.25,4474.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:32,0.2.1
+swiglu,huggingface,full,memory,MB,T,sequence length,1024,1294.25,1294.25,1294.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:36,0.2.1
+swiglu,huggingface,full,memory,MB,T,sequence length,2048,1992.25,1992.25,1992.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:36,0.2.1
+swiglu,huggingface,full,memory,MB,T,sequence length,4096,3452.25,3452.25,3452.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:36,0.2.1
+swiglu,huggingface,full,memory,MB,T,sequence length,8192,6372.25,6372.25,6372.25,"{""B"": 4, ""hidden_size"": 4096, ""dtype"": ""torch.bfloat16"", ""intermediate_size"": 11008, ""hidden_act"": ""silu""}",NVIDIA A100-SXM4-80GB,2024-09-03 15:40:36,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,4096,1536.0009765625,1536.0009765625,1536.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,8192,3072.0009765625,3072.0009765625,3072.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,16384,6144.0009765625,6144.0009765625,6144.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,32768,12288.0009765625,12288.0009765625,12288.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,65536,24576.0,24576.0,24576.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,liger,full,memory,MB,V,vocab size,131072,49152.0,49152.0,49152.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:40,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,4096,1792.0,1792.0,1792.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,8192,3584.0,3584.0,3584.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,16384,7168.0,7168.0,7168.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,32768,14336.0,14336.0,14336.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,65536,28672.0,28672.0,28672.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,torch,full,memory,MB,V,vocab size,131072,57344.0,57344.0,57344.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:41,0.2.1
+kl_div,liger,forward,speed,ms,V,vocab size,4096,0.30640000104904175,0.30563199520111084,0.30745598673820496,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1
+kl_div,liger,forward,speed,ms,V,vocab size,8192,0.5763360261917114,0.5754943490028381,0.5773376226425171,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1
+kl_div,liger,forward,speed,ms,V,vocab size,16384,1.1176480054855347,1.1165119409561157,1.1186367273330688,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1
+kl_div,liger,forward,speed,ms,V,vocab size,32768,2.1987199783325195,2.1970815658569336,2.200934410095215,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1
+kl_div,liger,forward,speed,ms,V,vocab size,65536,4.356672286987305,4.355186939239502,4.358956813812256,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1 +kl_div,liger,forward,speed,ms,V,vocab size,131072,8.697919845581055,8.690688133239746,8.703583717346191,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:43,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,4096,1.3298559188842773,1.3287359476089478,1.331385612487793,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,8192,2.594543933868408,2.592736005783081,2.596640110015869,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,16384,5.13375997543335,5.1324286460876465,5.1364288330078125,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,32768,10.225567817687988,10.225190162658691,10.227231979370117,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,65536,20.412960052490234,20.411020278930664,20.415000915527344,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,torch,forward,speed,ms,V,vocab size,131072,40.818641662597656,40.816402435302734,40.82087707519531,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:45,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,4096,2.040031909942627,1.9614335298538208,2.192307233810425,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,8192,3.866431951522827,3.7955007553100586,3.8693249225616455,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,16384,7.261951923370361,7.255136013031006,7.281760215759277,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,32768,15.092127799987793,15.07801628112793,15.09660816192627,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,65536,29.921375274658203,29.914867401123047,29.921951293945312,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,liger,full,speed,ms,V,vocab size,131072,59.70220947265625,59.70220947265625,59.70220947265625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:46,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,4096,2.8552000522613525,2.852755069732666,2.856454372406006,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,8192,5.593632221221924,5.590988636016846,5.594636917114258,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,16384,11.124671936035156,11.122162818908691,11.125061988830566,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,32768,23.052032470703125,23.050334930419922,23.052589416503906,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,65536,46.063167572021484,46.05990219116211,46.06643295288086,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +kl_div,torch,full,speed,ms,V,vocab size,131072,92.06393432617188,92.06393432617188,92.06393432617188,"{""B"": 8, ""T"": 2048}",NVIDIA H100 PCIe,2024-09-04 12:59:48,0.2.1 +jsd,liger,full,memory,MB,V,vocab size,4096,768.0029296875,768.0029296875,768.0029296875,"{""B"": 4, 
""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,liger,full,memory,MB,V,vocab size,8192,1536.0029296875,1536.0029296875,1536.0029296875,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,liger,full,memory,MB,V,vocab size,16384,3072.0048828125,3072.0048828125,3072.0048828125,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,liger,full,memory,MB,V,vocab size,32768,6144.0087890625,6144.0087890625,6144.0087890625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,liger,full,memory,MB,V,vocab size,65536,12288.0166015625,12288.0166015625,12288.0166015625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,liger,full,memory,MB,V,vocab size,131072,24576.015625,24576.015625,24576.015625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:31,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,4096,1664.0009765625,1664.0009765625,1664.0009765625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,8192,3328.0009765625,3328.0009765625,3328.0009765625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,16384,6656.0009765625,6656.0009765625,6656.0009765625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,32768,13312.0009765625,13312.0009765625,13312.0009765625,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,65536,26624.0,26624.0,26624.0,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,torch,full,memory,MB,V,vocab size,131072,53248.0,53248.0,53248.0,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:33,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,4096,0.4651840031147003,0.4636736214160919,0.4659839868545532,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,8192,0.927888035774231,0.926751971244812,0.92952960729599,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,16384,10.96003246307373,10.942886352539062,10.970770835876465,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,32768,22.405792236328125,22.390380859375,22.41998863220215,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,65536,43.49095916748047,43.47438049316406,43.50754165649414,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,liger,forward,speed,ms,V,vocab size,131072,87.0363540649414,87.0363540649414,87.0363540649414,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:37,0.3.1 +jsd,torch,forward,speed,ms,V,vocab size,4096,2.4744958877563477,2.4725184440612793,2.4764864444732666,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,torch,forward,speed,ms,V,vocab size,8192,4.8528642654418945,4.851238250732422,4.854745864868164,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,torch,forward,speed,ms,V,vocab size,16384,9.532496452331543,9.528634071350098,9.535890579223633,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,torch,forward,speed,ms,V,vocab size,32768,18.91379165649414,18.911853790283203,18.919116973876953,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,torch,forward,speed,ms,V,vocab 
size,65536,37.70152282714844,37.70074462890625,37.70229721069336,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,torch,forward,speed,ms,V,vocab size,131072,75.37680053710938,75.37680053710938,75.37680053710938,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:38,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,4096,1.2074079513549805,1.1739968061447144,1.2760319709777832,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,8192,2.091792106628418,2.0771327018737793,2.106553554534912,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,16384,12.928031921386719,12.8988676071167,12.936230659484863,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,32768,26.55548858642578,26.550823211669922,26.570655822753906,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,65536,51.6833610534668,51.6833610534668,51.6833610534668,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,liger,full,speed,ms,V,vocab size,131072,103.12793731689453,103.12793731689453,103.12793731689453,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:40,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,4096,5.397359848022461,5.392876625061035,5.39998722076416,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,8192,10.60153579711914,10.597900390625,10.60470962524414,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,16384,20.9442081451416,20.94247055053711,20.9469051361084,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,32768,42.113216400146484,42.113216400146484,42.113216400146484,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,65536,83.9959716796875,83.9959716796875,83.9959716796875,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +jsd,torch,full,speed,ms,V,vocab size,131072,167.94175720214844,167.94175720214844,167.94175720214844,"{""B"": 4, ""T"": 2048}",NVIDIA H100 PCIe,2024-10-02 16:21:43,0.3.1 +fused_linear_jsd,liger,forward,speed,ms,BT,B x T,1024,110.02185821533203,110.02185821533203,110.02185821533203,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:18,0.3.1 +fused_linear_jsd,liger,forward,speed,ms,BT,B x T,2048,124.14070129394531,124.14070129394531,124.14070129394531,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:18,0.3.1 +fused_linear_jsd,liger,forward,speed,ms,BT,B x T,4096,143.15420532226562,143.15420532226562,143.15420532226562,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:18,0.3.1 +fused_linear_jsd,liger,forward,speed,ms,BT,B x T,8192,180.90406799316406,180.90406799316406,180.90406799316406,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:18,0.3.1 +fused_linear_jsd,torch,forward,speed,ms,BT,B x T,1024,9.556896209716797,9.550745964050293,9.576268196105957,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:20,0.3.1 
+fused_linear_jsd,torch,forward,speed,ms,BT,B x T,2048,18.73731231689453,18.732704162597656,18.737701416015625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:20,0.3.1 +fused_linear_jsd,torch,forward,speed,ms,BT,B x T,4096,37.830482482910156,37.80821990966797,37.85274124145508,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:20,0.3.1 +fused_linear_jsd,torch,forward,speed,ms,BT,B x T,8192,75.15289306640625,75.15289306640625,75.15289306640625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:20,0.3.1 +fused_linear_jsd,liger,full,speed,ms,BT,B x T,1024,111.16019439697266,111.16019439697266,111.16019439697266,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:24,0.3.1 +fused_linear_jsd,liger,full,speed,ms,BT,B x T,2048,125.6825942993164,125.6825942993164,125.6825942993164,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:24,0.3.1 +fused_linear_jsd,liger,full,speed,ms,BT,B x T,4096,144.00784301757812,144.00784301757812,144.00784301757812,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:24,0.3.1 +fused_linear_jsd,liger,full,speed,ms,BT,B x T,8192,182.5832977294922,182.5832977294922,182.5832977294922,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:24,0.3.1 +fused_linear_jsd,torch,full,speed,ms,BT,B x T,1024,25.977184295654297,25.968351364135742,25.989356994628906,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:27,0.3.1 +fused_linear_jsd,torch,full,speed,ms,BT,B x T,2048,49.48417663574219,49.47330093383789,49.495052337646484,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:27,0.3.1 +fused_linear_jsd,torch,full,speed,ms,BT,B x T,4096,98.31510162353516,98.31510162353516,98.31510162353516,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:27,0.3.1 +fused_linear_jsd,torch,full,speed,ms,BT,B x T,8192,195.29539489746094,195.29539489746094,195.29539489746094,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:27,0.3.1 +fused_linear_jsd,liger,full,memory,MB,BT,B x T,1024,4652.48486328125,4652.48486328125,4652.48486328125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:33,0.3.1 +fused_linear_jsd,liger,full,memory,MB,BT,B x T,2048,5231.93798828125,5231.93798828125,5231.93798828125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:33,0.3.1 +fused_linear_jsd,liger,full,memory,MB,BT,B x T,4096,6391.87548828125,6391.87548828125,6391.87548828125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:33,0.3.1 +fused_linear_jsd,liger,full,memory,MB,BT,B x T,8192,8711.75,8711.75,8711.75,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB 
HBM3,2024-10-09 12:29:33,0.3.1 +fused_linear_jsd,torch,full,memory,MB,BT,B x T,1024,10609.005859375,10609.005859375,10609.005859375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:35,0.3.1 +fused_linear_jsd,torch,full,memory,MB,BT,B x T,2048,17146.009765625,17146.009765625,17146.009765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:35,0.3.1 +fused_linear_jsd,torch,full,memory,MB,BT,B x T,4096,30220.017578125,30220.017578125,30220.017578125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:35,0.3.1 +fused_linear_jsd,torch,full,memory,MB,BT,B x T,8192,56368.015625,56368.015625,56368.015625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA H100 80GB HBM3,2024-10-09 12:29:35,0.3.1 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,116.00621032714844,116.00621032714844,116.00621032714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,230.83609008789062,230.83609008789062,230.83609008789062,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,461.9543151855469,461.9543151855469,461.9543151855469,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,16,922.994384765625,922.994384765625,922.994384765625,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:05,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,2,39.558860778808594,39.52657699584961,39.591148376464844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:36,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,4,79.9734115600586,79.9734115600586,79.9734115600586,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:36,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,8,160.071044921875,160.071044921875,160.071044921875,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:36,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,16,321.4681091308594,321.4681091308594,321.4681091308594,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:24:36,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,2,116.56009674072266,116.56009674072266,116.56009674072266,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:17,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,4,232.43980407714844,232.43980407714844,232.43980407714844,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:17,0.4.0 
+fused_linear_orpo_loss,liger,full,speed,ms,B,B,8,464.5750732421875,464.5750732421875,464.5750732421875,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:17,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,16,926.3385009765625,926.3385009765625,926.3385009765625,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:17,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,2,120.68428802490234,120.68428802490234,120.68428802490234,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:58,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,4,241.15061950683594,241.15061950683594,241.15061950683594,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:58,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,8,492.5342102050781,492.5342102050781,492.5342102050781,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:58,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,16,1000.8460693359375,1000.8460693359375,1000.8460693359375,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:25:58,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,2,14556.626953125,14556.626953125,14556.626953125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:26:42,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,4,14748.689453125,14748.689453125,14748.689453125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:26:42,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,8,15132.814453125,15132.814453125,15132.814453125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:26:42,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,16,15901.064453125,15901.064453125,15901.064453125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:26:42,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,2,12488.501953125,12488.501953125,12488.501953125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:27:10,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,4,19630.564453125,19630.564453125,19630.564453125,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:27:10,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,8,33914.6875,33914.6875,33914.6875,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:27:10,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,16,62482.9375,62482.9375,62482.9375,"{""T"": 4096, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 21:27:10,0.4.0 
+fused_linear_orpo_loss,liger,forward,speed,ms,B,B,2,31.02783966064453,31.027551651000977,31.164947509765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:30,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,4,60.88966369628906,60.88966369628906,60.88966369628906,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:30,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,8,121.08070373535156,121.08070373535156,121.08070373535156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:30,0.4.0 +fused_linear_orpo_loss,liger,forward,speed,ms,B,B,16,244.36968994140625,244.36968994140625,244.36968994140625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:30,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,2,12.9093599319458,12.874624252319336,12.947936058044434,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:57,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,4,25.557632446289062,25.526700973510742,25.703763961791992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:57,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,8,51.75590515136719,51.75590515136719,51.75590515136719,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:57,0.4.0 +fused_linear_orpo_loss,huggingface,forward,speed,ms,B,B,16,103.8515853881836,103.8515853881836,103.8515853881836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:06:57,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,2,32.52537536621094,32.49258041381836,32.558170318603516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:28,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,4,63.16300964355469,63.16300964355469,63.16300964355469,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:28,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,8,123.02518463134766,123.02518463134766,123.02518463134766,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:28,0.4.0 +fused_linear_orpo_loss,liger,full,speed,ms,B,B,16,247.44105529785156,247.44105529785156,247.44105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:28,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,2,39.32752227783203,39.32701873779297,39.32802200317383,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:59,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,4,77.9202880859375,77.9202880859375,77.9202880859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA 
A100-SXM4-80GB,2024-11-13 22:07:59,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,8,151.6084442138672,151.6084442138672,151.6084442138672,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:59,0.4.0 +fused_linear_orpo_loss,huggingface,full,speed,ms,B,B,16,304.4580993652344,304.4580993652344,304.4580993652344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:07:59,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,2,8161.34619140625,8161.34619140625,8161.34619140625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:30,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,4,8209.361328125,8209.361328125,8209.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:30,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,8,8305.392578125,8305.392578125,8305.392578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:30,0.4.0 +fused_linear_orpo_loss,liger,full,memory,MB,B,B,16,8497.455078125,8497.455078125,8497.455078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:30,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,2,8645.314453125,8645.314453125,8645.314453125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:56,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,4,12184.330078125,12184.330078125,12184.330078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:56,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,8,19262.361328125,19262.361328125,19262.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:56,0.4.0 +fused_linear_orpo_loss,huggingface,full,memory,MB,B,B,16,33418.421875,33418.421875,33418.421875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-13 22:08:56,0.4.0 +fused_linear_cpo_loss,liger,forward,speed,ms,B,B,2,31.536447525024414,31.457439422607422,31.543052673339844,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:54:47,0.4.1 +fused_linear_cpo_loss,liger,forward,speed,ms,B,B,4,62.407745361328125,62.407745361328125,62.407745361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:54:47,0.4.1 +fused_linear_cpo_loss,liger,forward,speed,ms,B,B,8,123.64259338378906,123.64259338378906,123.64259338378906,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:54:47,0.4.1 +fused_linear_cpo_loss,liger,forward,speed,ms,B,B,16,245.66575622558594,245.66575622558594,245.66575622558594,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:54:47,0.4.1 
+fused_linear_cpo_loss,huggingface,forward,speed,ms,B,B,2,14.516239166259766,14.514080047607422,14.52575969696045,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:20,0.4.1 +fused_linear_cpo_loss,huggingface,forward,speed,ms,B,B,4,26.087743759155273,25.943340301513672,26.269376754760742,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:20,0.4.1 +fused_linear_cpo_loss,huggingface,forward,speed,ms,B,B,8,51.85932922363281,51.85932922363281,51.85932922363281,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:20,0.4.1 +fused_linear_cpo_loss,huggingface,forward,speed,ms,B,B,16,104.99673461914062,104.99673461914062,104.99673461914062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:20,0.4.1 +fused_linear_cpo_loss,liger,full,speed,ms,B,B,2,33.309967041015625,33.21604919433594,33.40388488769531,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:55,0.4.1 +fused_linear_cpo_loss,liger,full,speed,ms,B,B,4,63.053470611572266,63.053470611572266,63.053470611572266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:55,0.4.1 +fused_linear_cpo_loss,liger,full,speed,ms,B,B,8,125.53849792480469,125.53849792480469,125.53849792480469,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:55,0.4.1 +fused_linear_cpo_loss,liger,full,speed,ms,B,B,16,250.22178649902344,250.22178649902344,250.22178649902344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:55:55,0.4.1 +fused_linear_cpo_loss,huggingface,full,speed,ms,B,B,2,39.45849609375,39.33102798461914,39.58596420288086,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:56:30,0.4.1 +fused_linear_cpo_loss,huggingface,full,speed,ms,B,B,4,77.00272369384766,77.00272369384766,77.00272369384766,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:56:30,0.4.1 +fused_linear_cpo_loss,huggingface,full,speed,ms,B,B,8,154.28419494628906,154.28419494628906,154.28419494628906,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:56:30,0.4.1 +fused_linear_cpo_loss,huggingface,full,speed,ms,B,B,16,309.23162841796875,309.23162841796875,309.23162841796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:56:30,0.4.1 +fused_linear_cpo_loss,liger,full,memory,MB,B,B,2,8161.34619140625,8161.34619140625,8161.34619140625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:06,0.4.1 +fused_linear_cpo_loss,liger,full,memory,MB,B,B,4,8209.361328125,8209.361328125,8209.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 
16:57:06,0.4.1 +fused_linear_cpo_loss,liger,full,memory,MB,B,B,8,8305.392578125,8305.392578125,8305.392578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:06,0.4.1 +fused_linear_cpo_loss,liger,full,memory,MB,B,B,16,8497.455078125,8497.455078125,8497.455078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:06,0.4.1 +fused_linear_cpo_loss,huggingface,full,memory,MB,B,B,2,8645.314453125,8645.314453125,8645.314453125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:37,0.4.1 +fused_linear_cpo_loss,huggingface,full,memory,MB,B,B,4,12184.330078125,12184.330078125,12184.330078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:37,0.4.1 +fused_linear_cpo_loss,huggingface,full,memory,MB,B,B,8,19262.361328125,19262.361328125,19262.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:37,0.4.1 +fused_linear_cpo_loss,huggingface,full,memory,MB,B,B,16,33418.42578125,33418.42578125,33418.42578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-14 16:57:37,0.4.1 +fused_linear_simpo_loss,liger,forward,speed,ms,B,B,2,30.28438377380371,30.107013702392578,30.284786224365234,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:26,0.4.1 +fused_linear_simpo_loss,liger,forward,speed,ms,B,B,4,58.80876922607422,58.80876922607422,58.80876922607422,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:26,0.4.1 +fused_linear_simpo_loss,liger,forward,speed,ms,B,B,8,117.96163177490234,117.96163177490234,117.96163177490234,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:26,0.4.1 +fused_linear_simpo_loss,liger,forward,speed,ms,B,B,16,235.60794067382812,235.60794067382812,235.60794067382812,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:26,0.4.1 +fused_linear_simpo_loss,huggingface,forward,speed,ms,B,B,2,14.513839721679688,14.510687828063965,14.517855644226074,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:56,0.4.1 +fused_linear_simpo_loss,huggingface,forward,speed,ms,B,B,4,28.78099250793457,28.72719383239746,28.792186737060547,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:56,0.4.1 +fused_linear_simpo_loss,huggingface,forward,speed,ms,B,B,8,52.5733757019043,52.5733757019043,52.5733757019043,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:27:56,0.4.1 +fused_linear_simpo_loss,huggingface,forward,speed,ms,B,B,16,104.44764709472656,104.44764709472656,104.44764709472656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 
14:27:56,0.4.1 +fused_linear_simpo_loss,liger,full,speed,ms,B,B,2,31.566062927246094,31.457612991333008,31.674514770507812,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:28:27,0.4.1 +fused_linear_simpo_loss,liger,full,speed,ms,B,B,4,61.4403190612793,61.4403190612793,61.4403190612793,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:28:27,0.4.1 +fused_linear_simpo_loss,liger,full,speed,ms,B,B,8,119.97705841064453,119.97705841064453,119.97705841064453,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:28:27,0.4.1 +fused_linear_simpo_loss,liger,full,speed,ms,B,B,16,238.13417053222656,238.13417053222656,238.13417053222656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:28:27,0.4.1 +fused_linear_simpo_loss,huggingface,full,speed,ms,B,B,2,39.811119079589844,39.65474319458008,39.96749496459961,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:00,0.4.1 +fused_linear_simpo_loss,huggingface,full,speed,ms,B,B,4,77.20928192138672,77.20928192138672,77.20928192138672,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:00,0.4.1 +fused_linear_simpo_loss,huggingface,full,speed,ms,B,B,8,153.6952667236328,153.6952667236328,153.6952667236328,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:00,0.4.1 +fused_linear_simpo_loss,huggingface,full,speed,ms,B,B,16,307.7382507324219,307.7382507324219,307.7382507324219,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:00,0.4.1 +fused_linear_simpo_loss,liger,full,memory,MB,B,B,2,7675.3291015625,7675.3291015625,7675.3291015625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:33,0.4.1 +fused_linear_simpo_loss,liger,full,memory,MB,B,B,4,7723.3447265625,7723.3447265625,7723.3447265625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:33,0.4.1 +fused_linear_simpo_loss,liger,full,memory,MB,B,B,8,7819.3759765625,7819.3759765625,7819.3759765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:33,0.4.1 +fused_linear_simpo_loss,liger,full,memory,MB,B,B,16,8011.4384765625,8011.4384765625,8011.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:29:33,0.4.1 +fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,2,8645.314453125,8645.314453125,8645.314453125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1 +fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,4,12184.330078125,12184.330078125,12184.330078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1 
+fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,8,19262.361328125,19262.361328125,19262.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1 +fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,16,33418.42578125,33418.42578125,33418.42578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1 +distill_jsd_loss,liger,forward,speed,ms,BT,B x T,1024,7.735536098480225,7.729177474975586,7.798131465911865,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2 +distill_jsd_loss,liger,forward,speed,ms,BT,B x T,2048,15.20411205291748,15.165056228637695,15.226079940795898,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2 +distill_jsd_loss,liger,forward,speed,ms,BT,B x T,4096,30.159456253051758,30.126911163330078,30.165311813354492,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2 +distill_jsd_loss,liger,forward,speed,ms,BT,B x T,8192,60.24163055419922,60.24163055419922,60.24163055419922,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:58:46,0.4.2 +distill_jsd_loss,torch,forward,speed,ms,BT,B x T,1024,10.906111717224121,10.903244972229004,10.91296672821045,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:18,0.4.2 +distill_jsd_loss,torch,forward,speed,ms,BT,B x T,2048,21.480207443237305,21.465139389038086,21.489286422729492,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:18,0.4.2 +distill_jsd_loss,torch,forward,speed,ms,BT,B x T,4096,42.96339416503906,42.96237564086914,42.96440887451172,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:18,0.4.2 +distill_jsd_loss,torch,forward,speed,ms,BT,B x T,8192,85.3946533203125,85.3946533203125,85.3946533203125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:18,0.4.2 +distill_jsd_loss,liger,full,speed,ms,BT,B x T,1024,8.312895774841309,8.310400009155273,8.326751708984375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:51,0.4.2 
+distill_jsd_loss,liger,full,speed,ms,BT,B x T,2048,15.770208358764648,15.767775535583496,15.774784088134766,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:51,0.4.2 +distill_jsd_loss,liger,full,speed,ms,BT,B x T,4096,30.922752380371094,30.920312881469727,30.927898406982422,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:51,0.4.2 +distill_jsd_loss,liger,full,speed,ms,BT,B x T,8192,60.70627212524414,60.70627212524414,60.70627212524414,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 07:59:51,0.4.2 +distill_jsd_loss,torch,full,speed,ms,BT,B x T,1024,28.72480010986328,28.718809127807617,28.728179931640625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:25,0.4.2 +distill_jsd_loss,torch,full,speed,ms,BT,B x T,2048,54.281761169433594,54.281761169433594,54.281761169433594,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:25,0.4.2 +distill_jsd_loss,torch,full,speed,ms,BT,B x T,4096,107.08905792236328,107.08905792236328,107.08905792236328,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:25,0.4.2 +distill_jsd_loss,torch,full,speed,ms,BT,B x T,8192,213.1598663330078,213.1598663330078,213.1598663330078,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:25,0.4.2 +distill_jsd_loss,liger,full,memory,MB,BT,B x T,1024,10913.541015625,10913.541015625,10913.541015625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:58,0.4.2 +distill_jsd_loss,liger,full,memory,MB,BT,B x T,2048,10941.548828125,10941.548828125,10941.548828125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:58,0.4.2 +distill_jsd_loss,liger,full,memory,MB,BT,B x T,4096,10997.564453125,10997.564453125,10997.564453125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:58,0.4.2 +distill_jsd_loss,liger,full,memory,MB,BT,B x T,8192,11109.595703125,11109.595703125,11109.595703125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, 
""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:00:58,0.4.2 +distill_jsd_loss,torch,full,memory,MB,BT,B x T,1024,16174.0390625,16174.0390625,16174.0390625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2 +distill_jsd_loss,torch,full,memory,MB,BT,B x T,2048,23713.05078125,23713.05078125,23713.05078125,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2 +distill_jsd_loss,torch,full,memory,MB,BT,B x T,4096,38791.07421875,38791.07421875,38791.07421875,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2 +distill_jsd_loss,torch,full,memory,MB,BT,B x T,8192,68947.1015625,68947.1015625,68947.1015625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA H100 80GB HBM3,2024-12-03 08:01:32,0.4.2 +kto_loss,liger,forward,speed,ms,B,Batch Size (B),2,3.9951679706573486,3.991487979888916,4.002252578735352,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4 +kto_loss,liger,forward,speed,ms,B,Batch Size (B),4,7.8037919998168945,7.788575649261475,7.808595180511475,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4 +kto_loss,liger,forward,speed,ms,B,Batch Size (B),8,15.43172836303711,15.430015563964844,15.4335355758667,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4 +kto_loss,liger,forward,speed,ms,B,Batch Size (B),16,30.66864013671875,30.66431999206543,30.670501708984375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4 +kto_loss,liger,forward,speed,ms,B,Batch Size (B),32,61.1163215637207,61.1163215637207,61.1163215637207,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:22:44,0.5.4 +kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),2,3.8766400814056396,3.8680384159088135,3.8897151947021484,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4 +kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),4,7.213727951049805,7.206470489501953,7.229574680328369,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 
08:23:01,0.5.4 +kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),8,13.828800201416016,13.810944557189941,13.834943771362305,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4 +kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),16,27.0930233001709,27.08517074584961,27.09713363647461,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4 +kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),32,54.13715362548828,54.13715362548828,54.13715362548828,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:01,0.5.4 +kto_loss,liger,full,speed,ms,B,Batch Size (B),2,4.782928466796875,4.677459239959717,5.3430914878845215,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4 +kto_loss,liger,full,speed,ms,B,Batch Size (B),4,8.517248153686523,8.481344223022461,8.561504364013672,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4 +kto_loss,liger,full,speed,ms,B,Batch Size (B),8,16.547504425048828,16.513471603393555,16.678144454956055,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4 +kto_loss,liger,full,speed,ms,B,Batch Size (B),16,31.891263961791992,31.819705963134766,32.274131774902344,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4 +kto_loss,liger,full,speed,ms,B,Batch Size (B),32,62.953758239746094,62.953758239746094,62.953758239746094,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:18,0.5.4 +kto_loss,huggingface,full,speed,ms,B,Batch Size (B),2,6.201632022857666,6.163315296173096,6.314668655395508,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4 +kto_loss,huggingface,full,speed,ms,B,Batch Size (B),4,11.156224250793457,11.142304420471191,11.207296371459961,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4 +kto_loss,huggingface,full,speed,ms,B,Batch Size (B),8,21.249855041503906,21.231891632080078,21.264543533325195,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4 +kto_loss,huggingface,full,speed,ms,B,Batch Size (B),16,41.55686569213867,41.536956787109375,41.57677459716797,"{""T"": 512, ""H"": 1024, ""V"": 128256, 
""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4 +kto_loss,huggingface,full,speed,ms,B,Batch Size (B),32,81.56924438476562,81.56924438476562,81.56924438476562,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:35,0.5.4 +kto_loss,liger,full,memory,MB,B,Batch Size (B),2,2585.73876953125,2585.73876953125,2585.73876953125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4 +kto_loss,liger,full,memory,MB,B,Batch Size (B),4,3348.9892578125,3348.9892578125,3348.9892578125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4 +kto_loss,liger,full,memory,MB,B,Batch Size (B),8,3361.0048828125,3361.0048828125,3361.0048828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4 +kto_loss,liger,full,memory,MB,B,Batch Size (B),16,3385.0361328125,3385.0361328125,3385.0361328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4 +kto_loss,liger,full,memory,MB,B,Batch Size (B),32,3433.0986328125,3433.0986328125,3433.0986328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:23:55,0.5.4 +kto_loss,huggingface,full,memory,MB,B,Batch Size (B),2,4341.74951171875,4341.74951171875,4341.74951171875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4 +kto_loss,huggingface,full,memory,MB,B,Batch Size (B),4,6099.26513671875,6099.26513671875,6099.26513671875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4 +kto_loss,huggingface,full,memory,MB,B,Batch Size (B),8,9613.298828125,9613.298828125,9613.298828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4 +kto_loss,huggingface,full,memory,MB,B,Batch Size (B),16,16643.365234375,16643.365234375,16643.365234375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4 +kto_loss,huggingface,full,memory,MB,B,Batch Size (B),32,30703.498046875,30703.498046875,30703.498046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA H100 80GB HBM3,2025-03-03 08:24:11,0.5.4 +sparsemax,liger,forward,speed,ms,V,feature 
size,1024,0.41471999883651733,0.4126720130443573,0.42393600940704346,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,liger,forward,speed,ms,V,feature size,2048,0.7608320116996765,0.7598080039024353,0.7628800272941589,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,liger,forward,speed,ms,V,feature size,4096,1.4561280012130737,1.4540799856185913,1.4581760168075562,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,liger,forward,speed,ms,V,feature size,8192,5.288959980010986,5.2848639488220215,5.29986572265625,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,liger,forward,speed,ms,V,feature size,16384,10.734624862670898,10.729472160339355,11.096882820129395,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,liger,forward,speed,ms,V,feature size,32768,21.729312896728516,21.7128963470459,22.20728302001953,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:08,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,1024,0.42291200160980225,0.42188799381256104,0.42393600940704346,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,2048,0.7782400250434875,0.7772160172462463,0.779263973236084,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,4096,1.4940160512924194,1.491968035697937,1.4960639476776123,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,8192,5.359615802764893,5.356544017791748,5.366579055786133,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,16384,10.883584022521973,10.874879837036133,11.224268913269043,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,torch,forward,speed,ms,V,feature size,32768,22.19878387451172,22.018457412719727,22.48888397216797,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:12,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,1024,0.4558719992637634,0.45558398962020874,0.45772799849510193,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,2048,0.8488960266113281,0.8478720188140869,0.8509439826011658,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,4096,1.6476160287857056,1.6465920209884644,1.6499264240264893,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,8192,5.664768218994141,5.660672187805176,5.681356906890869,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": 
""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,16384,11.486207962036133,11.478015899658203,11.874713897705078,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,liger,full,speed,ms,V,feature size,32768,23.457279205322266,23.289682388305664,23.76642608642578,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:16,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,1024,0.6021119952201843,0.6010879874229431,0.6041600108146667,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,2048,1.1212799549102783,1.119264006614685,1.1223039627075195,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,4096,2.1637120246887207,2.1616640090942383,2.165760040283203,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,8192,6.693888187408447,6.68723201751709,6.705561637878418,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,16384,13.523456573486328,13.518848419189453,13.878681182861328,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,torch,full,speed,ms,V,feature size,32768,27.604991912841797,27.295129776000977,27.77518081665039,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:20,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,1024,0.04403200000524521,0.043007999658584595,0.05222399905323982,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,2048,0.08806400001049042,0.08713600039482117,0.08806400001049042,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,4096,0.1884160041809082,0.1884160041809082,0.18943999707698822,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,8192,0.374783992767334,0.37376001477241516,0.37486720085144043,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,16384,0.7516160011291504,0.7505919933319092,0.7516160011291504,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,liger,backward,speed,ms,V,feature size,32768,1.5738879442214966,1.572864055633545,1.575935959815979,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:22,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature size,1024,0.1812479943037033,0.1802240014076233,0.18227200210094452,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature 
size,2048,0.34406399726867676,0.34406399726867676,0.34508800506591797,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature size,4096,0.6717439889907837,0.6707199811935425,0.6727679967880249,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature size,8192,1.3250559568405151,1.3241215944290161,1.3260799646377563,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature size,16384,2.629631996154785,2.628607988357544,2.6306560039520264,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,backward,speed,ms,V,feature size,32768,5.236735820770264,5.235712051391602,5.239808082580566,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,1024,82.03515625,82.03515625,82.03515625,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,2048,164.0390625,164.0390625,164.0390625,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,4096,328.046875,328.046875,328.046875,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,8192,704.00048828125,704.00048828125,704.00048828125,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,16384,1408.00048828125,1408.00048828125,1408.00048828125,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,liger,full,memory,MB,V,feature size,32768,2816.00048828125,2816.00048828125,2816.00048828125,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:25,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,1024,82.03515625,82.03515625,82.03515625,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,2048,164.0390625,164.0390625,164.0390625,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,4096,328.046875,328.046875,328.046875,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,8192,704.00048828125,704.00048828125,704.00048828125,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,16384,1408.00048828125,1408.00048828125,1408.00048828125,"{""B"": 4, ""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +sparsemax,torch,full,memory,MB,V,feature size,32768,2816.00048828125,2816.00048828125,2816.00048828125,"{""B"": 4, 
""T"": 512, ""dim"": -1, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-28 00:38:26,0.5.8 +multi_token_attention,liger,forward,speed,ms,L,sequence length,32,0.01740800030529499,0.01740800030529499,0.018432000651955605,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,liger,forward,speed,ms,L,sequence length,64,0.018432000651955605,0.01740800030529499,0.01945599913597107,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,liger,forward,speed,ms,L,sequence length,128,0.023552000522613525,0.02252800017595291,0.02364799939095974,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,liger,forward,speed,ms,L,sequence length,256,0.043007999658584595,0.04198399931192398,0.043007999658584595,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,liger,forward,speed,ms,L,sequence length,512,0.12595200538635254,0.12492799758911133,0.12595200538635254,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,liger,forward,speed,ms,L,sequence length,1024,0.5283839702606201,0.5253120064735413,0.5294079780578613,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:10,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,32,0.2467840015888214,0.24063999950885773,0.2529279887676239,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,64,0.24166400730609894,0.23756800591945648,0.24883200228214264,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,128,0.24268800020217896,0.2385600060224533,0.24985599517822266,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,256,0.24166400730609894,0.23873919248580933,0.24782079458236694,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,512,0.31334400177001953,0.3102720081806183,0.3213888108730316,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,torch,forward,speed,ms,L,sequence length,1024,0.719871997833252,0.7167999744415283,0.7260159850120544,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 
3090,2025-04-28 04:46:11,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,32,0.9349120259284973,0.6543359756469727,0.9494400024414062,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,64,0.6215680241584778,0.5631999969482422,0.8916991949081421,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,128,0.5406720042228699,0.5335040092468262,0.550003170967102,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,256,0.5631999969482422,0.5560320019721985,0.5674688220024109,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,512,0.6430720090866089,0.6420480012893677,0.6430720090866089,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,liger,full,speed,ms,L,sequence length,1024,2.4780800342559814,2.4770560264587402,2.479987144470215,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:12,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,32,0.795199990272522,0.78438401222229,0.8038399815559387,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,64,0.7362560033798218,0.6504960060119629,0.7464960217475891,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,128,0.7680000066757202,0.6437439918518066,0.8105729818344116,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,256,0.7685279846191406,0.7586879730224609,0.783519983291626,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,512,0.9676799774169922,0.9625599980354309,0.9751039743423462,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,torch,full,speed,ms,L,sequence length,1024,2.772480010986328,2.7688961029052734,2.7842559814453125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:13,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence 
length,32,0.334879994392395,0.3222528100013733,0.6912000179290771,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence length,64,0.23756800591945648,0.228166401386261,0.2629631757736206,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence length,128,0.29785600304603577,0.2519040107727051,0.3081727921962738,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence length,256,0.2590720057487488,0.24391679465770721,0.30832639336586,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence length,512,0.5171200037002563,0.5169600248336792,0.5181440114974976,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,liger,backward,speed,ms,L,sequence length,1024,1.9578880071640015,1.9568639993667603,1.9615744352340698,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:14,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,32,0.09830400347709656,0.08908800035715103,0.20353920757770538,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,64,0.06348799914121628,0.062463998794555664,0.06348799914121628,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,128,0.09011200070381165,0.08908800035715103,0.09011200070381165,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,256,0.16383999586105347,0.16383999586105347,0.16486400365829468,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,512,0.52019202709198,0.5191680192947388,0.52019202709198,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,backward,speed,ms,L,sequence length,1024,1.9763200283050537,1.9752960205078125,1.9763200283050537,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,32,0.97412109375,0.97412109375,0.97412109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, 
""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,64,1.53662109375,1.53662109375,1.53662109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,128,3.69287109375,3.69287109375,3.69287109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,256,13.068359375,13.068359375,13.068359375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,512,48.974609375,48.974609375,48.974609375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,liger,full,memory,MB,L,sequence length,1024,192.974609375,192.974609375,192.974609375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,32,0.9599609375,0.9599609375,0.9599609375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,64,1.4814453125,1.4814453125,1.4814453125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,128,3.4736328125,3.4736328125,3.4736328125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,256,12.19287109375,12.19287109375,12.19287109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,512,45.47412109375,45.47412109375,45.47412109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +multi_token_attention,torch,full,memory,MB,L,sequence length,1024,178.97412109375,178.97412109375,178.97412109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-28 04:46:15,0.1.1 +softmax,liger,forward,speed,ms,N,hidden size,128,0.0071680000983178616,0.0071680000983178616,0.007942399941384792,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,256,0.008448000065982342,0.008191999979317188,0.009216000325977802,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,liger,forward,speed,ms,N,hidden 
size,512,0.013311999849975109,0.01228800043463707,0.013311999849975109,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,1024,0.021503999829292297,0.021503999829292297,0.02252800017595291,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,2048,0.04095999896526337,0.04095999896526337,0.04198399931192398,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,4096,0.0798719972372055,0.0798719972372055,0.08089599758386612,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:04,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,128,0.006144000217318535,0.006144000217318535,0.0071680000983178616,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,256,0.008191999979317188,0.008191999979317188,0.009216000325977802,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,512,0.01228800043463707,0.01228800043463707,0.013311999849975109,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,1024,0.02252800017595291,0.02252800017595291,0.023552000522613525,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,2048,0.057583998888731,0.05734400078654289,0.058368001133203506,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,4096,0.08323200047016144,0.08294399827718735,0.08396799862384796,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:07,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,128,0.053247999399900436,0.04505600035190582,0.06172160431742668,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,256,0.05939200147986412,0.04198399931192398,0.11169920116662979,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,512,0.11577600240707397,0.07720960676670074,0.16793599724769592,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,1024,0.12492799758911133,0.10273279249668121,0.2982015907764435,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,2048,0.1013759970664978,0.10035199671983719,0.12902399897575378,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,4096,0.19660800695419312,0.19660800695419312,0.19763199985027313,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:10,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,128,0.013311999849975109,0.013311999849975109,0.013504000380635262,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,torch,full,speed,ms,N,hidden 
size,256,0.019152000546455383,0.018432000651955605,0.01945599913597107,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,512,0.03891199827194214,0.03788800165057182,0.03891199827194214,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,1024,0.08396799862384796,0.08396799862384796,0.08499199897050858,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,2048,0.18329599499702454,0.18329599499702454,0.18432000279426575,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,4096,0.3307519853115082,0.32972800731658936,0.33169281482696533,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:13,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,128,0.006335999816656113,0.006144000217318535,0.0071680000983178616,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,256,0.0071680000983178616,0.006144000217318535,0.0071680000983178616,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,512,0.008191999979317188,0.008191999979317188,0.009216000325977802,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,1024,0.013311999849975109,0.01228800043463707,0.013311999849975109,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,2048,0.02252800017595291,0.02252800017595291,0.023552000522613525,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,liger,forward,speed,ms,N,hidden size,4096,0.04095999896526337,0.04095999896526337,0.04198399931192398,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:16,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,128,0.006144000217318535,0.005119999870657921,0.006144000217318535,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,256,0.006207999773323536,0.006144000217318535,0.0071680000983178616,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,512,0.008383999578654766,0.008191999979317188,0.009216000325977802,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,1024,0.014336000196635723,0.014336000196635723,0.014336000196635723,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,2048,0.05939200147986412,0.058368001133203506,0.05939200147986412,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,torch,forward,speed,ms,N,hidden size,4096,0.06758400052785873,0.06675200164318085,0.06758400052785873,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:19,0.5.8 +softmax,liger,full,speed,ms,N,hidden 
size,128,0.11472000181674957,0.09744639694690704,0.20684799551963806,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,256,0.15787199139595032,0.10769280046224594,0.20897281169891357,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,512,0.14028799533843994,0.0832064226269722,0.2879999876022339,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,1024,0.2088959962129593,0.11446399986743927,0.2972480058670044,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,2048,0.1443839967250824,0.09318400174379349,0.28278398513793945,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,liger,full,speed,ms,N,hidden size,4096,0.11673600226640701,0.10035199671983719,0.28074881434440613,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:22,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,128,0.011264000087976456,0.010239999741315842,0.011264000087976456,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,256,0.013311999849975109,0.013311999849975109,0.013632000423967838,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,512,0.01945599913597107,0.01945599913597107,0.01945599913597107,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,1024,0.04198399931192398,0.04198399931192398,0.04224000126123428,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,2048,0.12595200538635254,0.12595200538635254,0.12697599828243256,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,speed,ms,N,hidden size,4096,0.19763199985027313,0.19660800695419312,0.19809921085834503,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,128,0.00244140625,0.00244140625,0.00244140625,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,256,0.0048828125,0.0048828125,0.0048828125,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,512,0.009765625,0.009765625,0.009765625,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,1024,0.01953125,0.01953125,0.01953125,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,2048,0.0390625,0.0390625,0.0390625,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,4096,0.078125,0.078125,0.078125,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden 
size,128,0.0029296875,0.0029296875,0.0029296875,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,256,0.005859375,0.005859375,0.005859375,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,512,0.01171875,0.01171875,0.01171875,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,1024,0.0234375,0.0234375,0.0234375,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,2048,0.046875,0.046875,0.046875,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,4096,0.09375,0.09375,0.09375,"{""M"": 2048, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,128,0.00244140625,0.00244140625,0.00244140625,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,256,0.00244140625,0.00244140625,0.00244140625,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,512,0.0048828125,0.0048828125,0.0048828125,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,1024,0.009765625,0.009765625,0.009765625,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,2048,0.01953125,0.01953125,0.01953125,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,liger,full,memory,MB,N,hidden size,4096,0.0390625,0.0390625,0.0390625,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:25,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,128,0.0029296875,0.0029296875,0.0029296875,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,256,0.0029296875,0.0029296875,0.0029296875,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,512,0.005859375,0.005859375,0.005859375,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,1024,0.01171875,0.01171875,0.01171875,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,2048,0.0234375,0.0234375,0.0234375,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +softmax,torch,full,memory,MB,N,hidden size,4096,0.046875,0.046875,0.046875,"{""M"": 2048, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 3090,2025-04-30 16:11:26,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence length,32,0.31436800956726074,0.30646398663520813,0.319487988948822,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence 
length,64,0.3779039978981018,0.3678207993507385,0.38410240411758423,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence length,128,0.35020801424980164,0.3428351879119873,0.35839998722076416,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence length,256,0.5294079780578613,0.5283839702606201,0.5304319858551025,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence length,512,1.7315839529037476,1.7304960489273071,1.815551996231079,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,liger,forward,speed,ms,L,sequence length,1024,6.465375900268555,6.462463855743408,6.718054294586182,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:12,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,32,0.5888000130653381,0.5826560258865356,0.5960000157356262,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,64,0.6010879874229431,0.5947520136833191,0.608128011226654,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,128,0.5816320180892944,0.5745791792869568,0.5908480286598206,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,256,0.8591359853744507,0.8529919981956482,0.8627520203590393,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,512,1.931391954421997,1.925772786140442,1.935705542564392,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,torch,forward,speed,ms,L,sequence length,1024,6.76915168762207,6.761676788330078,7.009791851043701,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:13,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence length,32,2.111056089401245,2.0716030597686768,2.137094497680664,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence 
length,64,2.174975872039795,2.1364736557006836,2.297856092453003,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence length,128,2.0894718170166016,2.073791980743408,2.1352319717407227,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence length,256,2.137216091156006,1.8400319814682007,2.194175958633423,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence length,512,2.2814719676971436,2.1872639656066895,2.2833151817321777,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,liger,full,speed,ms,L,sequence length,1024,8.308735847473145,8.299519538879395,8.551424026489258,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:16,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,32,1.5749119520187378,1.498412847518921,2.170527935028076,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,64,1.494047999382019,1.482604742050171,1.5207936763763428,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,128,1.4581760168075562,1.4419968128204346,2.1133759021759033,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,256,1.7448960542678833,1.7180671691894531,1.7537024021148682,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,512,2.796544075012207,2.7762560844421387,2.8190720081329346,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,torch,full,speed,ms,L,sequence length,1024,9.511823654174805,9.501286506652832,9.787391662597656,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:17,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,32,0.3544960021972656,0.33546239137649536,0.8041215538978577,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,64,0.32897597551345825,0.32051199674606323,0.3438591957092285,"{""B"": 2, 
""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,128,0.30931198596954346,0.3002240061759949,0.3197120130062103,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,256,0.31334400177001953,0.2956160008907318,0.3251904249191284,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,512,0.447488009929657,0.44646400213241577,0.4485119879245758,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,backward,speed,ms,L,sequence length,1024,1.8585599660873413,1.8574656248092651,1.861631989479065,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,32,0.25804799795150757,0.24883200228214264,0.30926719307899475,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,64,0.25804799795150757,0.2514623999595642,0.26668161153793335,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,128,0.24075199663639069,0.2303999960422516,0.25194239616394043,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,256,0.24686399102210999,0.23756800591945648,0.2550272047519684,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,512,0.7045120000839233,0.704479992389679,0.7063615918159485,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,torch,backward,speed,ms,L,sequence length,1024,2.698431968688965,2.697216033935547,2.7013120651245117,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:18,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,32,0.3603515625,0.3603515625,0.3603515625,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,64,1.4189453125,1.4189453125,1.4189453125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": 
true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,128,5.6455078125,5.6455078125,5.6455078125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,256,22.53662109375,22.53662109375,22.53662109375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,512,90.06884765625,90.06884765625,90.06884765625,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,liger,full,memory,MB,L,sequence length,1024,360.13330078125,360.13330078125,360.13330078125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,32,0.45263671875,0.45263671875,0.45263671875,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,64,1.7685546875,1.7685546875,1.7685546875,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,128,7.04833984375,7.04833984375,7.04833984375,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,256,28.15478515625,28.15478515625,28.15478515625,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,512,112.55517578125,112.55517578125,112.55517578125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +sparse_multi_token_attention,torch,full,memory,MB,L,sequence length,1024,450.10595703125,450.10595703125,450.10595703125,"{""B"": 2, ""C_in"": 4, ""C_out"": 4, ""K"": 3, ""groups"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-04-30 17:22:19,0.5.8 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.236735999584198,0.16073599457740784,0.24985599517822266,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.22323200106620789,0.21503999829292297,0.2323904037475586,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 
+fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.24268800020217896,0.2295808047056198,0.25088000297546387,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.3307519853115082,0.32805120944976807,0.3317759931087494,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.8540160059928894,0.851967990398407,0.8595455884933472,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,2.3658719062805176,2.3617537021636963,2.368511915206909,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,8.466431617736816,8.447999954223633,8.480768203735352,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:08:54,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.16915225982666,5.143871784210205,5.297952175140381,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.244048118591309,10.094131469726562,10.48145866394043,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,20.196895599365234,20.145601272583008,21.581132888793945,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,42.183536529541016,41.2415771484375,43.12549591064453,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,77.73798370361328,77.73798370361328,77.73798370361328,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,172.90853881835938,172.90853881835938,172.90853881835938,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, 
""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,346.5686950683594,346.5686950683594,346.5686950683594,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:01,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,2.723423957824707,2.68287992477417,2.7842559814453125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,2.6542398929595947,2.6169726848602295,2.68984317779541,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,2.595871925354004,2.1286911964416504,2.6818559169769287,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,2.738736152648926,2.7115519046783447,2.8180480003356934,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,2.83457612991333,2.805759906768799,2.88972806930542,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,6.529168128967285,6.525951862335205,6.66664981842041,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,23.742895126342773,23.660747528076172,23.825515747070312,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:14,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,6.841343879699707,6.725196838378906,6.972832202911377,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.825152397155762,11.683839797973633,12.080537796020508,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence 
length,256,21.856351852416992,21.36012077331543,21.95940589904785,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,42.70033264160156,42.545169830322266,42.855499267578125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,87.9656982421875,87.9656982421875,87.9656982421875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,181.77536010742188,181.77536010742188,181.77536010742188,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,368.0634765625,368.0634765625,368.0634765625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:21,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.5920320153236389,0.5674688220024109,1.3856768608093262,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.6430720090866089,0.6318399906158447,0.6610943675041199,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.6456320285797119,0.6359040141105652,0.6676480174064636,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.7014399766921997,0.6911231875419617,0.7275007963180542,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,1.4684159755706787,1.4663679599761963,1.4704639911651611,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,4.150223731994629,4.14717435836792,4.234445095062256,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 
3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,15.17465591430664,14.853119850158691,15.310848236083984,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:22,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.6000639796257019,0.5832703709602356,1.2799999713897705,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.5550079941749573,0.5488640069961548,0.5914624333381653,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.5470079779624939,0.5406720042228699,0.562175989151001,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,0.8714240193367004,0.8617984056472778,1.2751424312591553,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,2.3746559619903564,2.3727169036865234,2.3797760009765625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,8.019968032836914,8.00870418548584,8.2227201461792,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,28.92291259765625,28.684505462646484,28.97941780090332,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:25,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.23756800591945648,0.22630399465560913,0.24985599517822266,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.25088000297546387,0.24187520146369934,0.25964802503585815,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.43110400438308716,0.42920318245887756,0.43212801218032837,"{""batch_size"": 4, 
""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,1.0199040174484253,1.0147839784622192,1.0281280279159546,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,2.584575891494751,2.578432083129883,2.593791961669922,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,7.8611040115356445,7.851212978363037,8.14100456237793,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,27.072511672973633,27.043020248413086,27.129650115966797,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:32,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.303808212280273,5.205196857452393,5.414611339569092,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.352640151977539,10.268671989440918,10.546982765197754,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,20.696575164794922,20.600217819213867,22.168373107910156,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,40.9251823425293,39.459224700927734,42.39113998413086,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,84.20972442626953,84.20972442626953,84.20972442626953,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,165.5727996826172,165.5727996826172,165.5727996826172,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 
+fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,365.4942626953125,365.4942626953125,365.4942626953125,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:38,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,2.5410561561584473,2.5221376419067383,2.574540853500366,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,2.6214399337768555,2.5966720581054688,2.66780161857605,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,2.6818559169769287,2.660710334777832,2.7396223545074463,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,2.9624319076538086,2.959359884262085,2.973695993423462,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,7.516160011291504,7.5141119956970215,7.782809734344482,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,22.99033546447754,22.859058380126953,23.101655960083008,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,79.14390563964844,79.14390563964844,79.14390563964844,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:09:52,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,6.206463813781738,6.177548885345459,6.346368312835693,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.45395278930664,11.369497299194336,11.57201862335205,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,21.295616149902344,20.8918514251709,22.428876876831055,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, 
""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,46.485904693603516,44.799137115478516,48.172672271728516,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,87.60115051269531,87.60115051269531,87.60115051269531,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,210.36146545410156,210.36146545410156,210.36146545410156,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,456.848388671875,456.848388671875,456.848388671875,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:00,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.5756800174713135,0.45319682359695435,0.7064127922058105,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.5908480286598206,0.48742398619651794,0.6028479933738708,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.915615975856781,0.8775680065155029,0.9175040125846863,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,1.9450880289077759,1.9351999759674072,1.9651199579238892,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,4.930560111999512,4.915200233459473,5.046477317810059,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,15.102832794189453,14.952447891235352,15.31494426727295,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence 
length,4096,52.104190826416016,52.104190826416016,52.104190826416016,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:02,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.4843519926071167,0.4761984050273895,0.6077119708061218,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.5319839715957642,0.5222399830818176,0.5335040092468262,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,1.1182080507278442,1.1151360273361206,1.120255947113037,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,2.5815041065216064,2.5763840675354004,2.5960447788238525,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,7.123968124389648,7.087513446807861,7.359897613525391,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,24.104448318481445,24.077312469482422,24.161880493164062,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,86.40716552734375,86.40716552734375,86.40716552734375,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:05,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.2467840015888214,0.17902079224586487,0.25702399015426636,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.23756800591945648,0.23654399812221527,0.24885760247707367,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.4567039906978607,0.45158401131629944,0.4638719856739044,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": 
""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.8017920255661011,0.7946239709854126,0.8048639893531799,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,1.9527679681777954,1.9476544857025146,1.9595264196395874,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,5.405695915222168,5.392384052276611,5.651423931121826,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,18.608959197998047,18.311372756958008,18.646629333496094,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:12,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,6.554111957550049,6.130688190460205,6.872096061706543,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,13.195263862609863,13.134265899658203,13.464166641235352,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,24.001535415649414,23.594995498657227,25.934438705444336,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,50.334720611572266,50.334720611572266,50.334720611572266,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,107.2701416015625,107.2701416015625,107.2701416015625,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,218.13658142089844,218.13658142089844,218.13658142089844,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence 
length,4096,457.2313537597656,457.2313537597656,457.2313537597656,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:20,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,2.623487949371338,2.605638265609741,2.6442177295684814,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,2.6389598846435547,2.6225087642669678,2.6781694889068604,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,2.613312005996704,2.589139223098755,2.6998207569122314,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,2.7299840450286865,2.7037951946258545,2.783027172088623,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,5.588992118835449,5.584896087646484,5.632409572601318,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,15.91859245300293,15.853568077087402,16.029695510864258,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,54.28019332885742,54.28019332885742,54.28019332885742,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:34,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,8.281087875366211,8.076288223266602,8.5731840133667,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,14.909952163696289,14.721952438354492,15.562975883483887,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,25.10848045349121,25.013248443603516,25.180980682373047,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 
3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,53.98118209838867,53.98118209838867,53.98118209838867,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,115.51538848876953,115.51538848876953,115.51538848876953,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,234.2144012451172,234.2144012451172,234.2144012451172,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,493.1143798828125,493.1143798828125,493.1143798828125,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:43,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.6873279809951782,0.6780927777290344,0.8112127780914307,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.6923519968986511,0.6756608486175537,0.8371520042419434,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.7854080200195312,0.7739391922950745,0.7946239709854126,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,1.5523840188980103,1.5431679487228394,1.5880192518234253,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,3.635200023651123,3.634176015853882,3.637446403503418,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,10.225664138793945,10.196991920471191,10.515456199645996,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,35.736061096191406,35.612876892089844,35.859249114990234,"{""batch_size"": 2, 
""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.4935680031776428,0.4843519926071167,1.2861696481704712,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.5950400233268738,0.4885439872741699,0.7454720735549927,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.9082880020141602,0.8939520120620728,1.2302591800689697,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,1.994752049446106,1.9916800260543823,2.002943992614746,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,5.427199840545654,5.400953769683838,5.5943169593811035,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,16.917503356933594,16.85626792907715,17.202789306640625,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,58.775550842285156,58.775550842285156,58.775550842285156,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:48,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.16998399794101715,0.159743994474411,0.24968959391117096,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.15515199303627014,0.14643199741840363,0.16281600296497345,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.16998399794101715,0.159743994474411,0.25088000297546387,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 
+fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.3307519853115082,0.32767999172210693,0.3317759931087494,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.8550400137901306,0.8529919981956482,0.8581119775772095,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,2.3664638996124268,2.36456298828125,2.371583938598633,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,8.253439903259277,8.21452808380127,8.534015655517578,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:52,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.056511878967285,4.674380779266357,5.254271984100342,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.41360092163086,10.147839546203613,10.88619613647461,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,21.108095169067383,19.98341178894043,22.000703811645508,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,39.93907165527344,39.49793243408203,40.380210876464844,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,87.47724914550781,87.47724914550781,87.47724914550781,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,162.8107147216797,162.8107147216797,162.8107147216797,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,318.89202880859375,318.89202880859375,318.89202880859375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, 
""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:58,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,2.756608009338379,2.50598406791687,2.862694263458252,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,2.683903932571411,2.656268835067749,2.720358371734619,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,2.6729280948638916,2.649907112121582,2.703104019165039,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,2.8049919605255127,2.7712254524230957,2.848358392715454,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,2.8816640377044678,2.8426239490509033,2.966118335723877,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,6.523903846740723,6.52185583114624,6.534143924713135,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,23.48236846923828,23.36788558959961,23.587430953979492,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:10:59,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,6.210592269897461,6.149964809417725,6.439935684204102,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.412479400634766,11.000422477722168,12.122776985168457,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,21.02124786376953,20.722354888916016,21.280357360839844,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence 
length,512,44.49420928955078,43.21909713745117,45.769317626953125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,77.97862243652344,77.97862243652344,77.97862243652344,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,169.87033081054688,169.87033081054688,169.87033081054688,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,360.7623596191406,360.7623596191406,360.7623596191406,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:06,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.6484479904174805,0.5443072319030762,1.446675181388855,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.5460799932479858,0.536575973033905,0.6473984122276306,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.5612640380859375,0.5377407670021057,0.6634495854377747,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.6347839832305908,0.6327999830245972,0.7219520211219788,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,1.4684159755706787,1.4624768495559692,1.4744960069656372,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,4.150784015655518,4.148223876953125,4.164403438568115,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,15.233535766601562,14.96678352355957,15.318016052246094,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA 
GeForce RTX 3090,2025-05-27 15:11:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.596992015838623,0.5801728367805481,1.2581120729446411,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.5565760135650635,0.456928014755249,0.5724160075187683,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.5560640096664429,0.4616512060165405,0.5724160075187683,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,0.8714240193367004,0.8622080087661743,1.2775424718856812,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,2.3746559619903564,2.371583938598633,2.3776895999908447,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,8.032719612121582,8.015257835388184,8.314061164855957,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,29.113344192504883,28.672204971313477,29.20366096496582,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,32.525390625,32.525390625,32.525390625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,37.7734375,37.7734375,37.7734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,53.2734375,53.2734375,53.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,102.2734375,102.2734375,102.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": 
""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,272.2734375,272.2734375,272.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,900.2734375,900.2734375,900.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,3308.2734375,3308.2734375,3308.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:10,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,32.53125,32.53125,32.53125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,36.8046875,36.8046875,36.8046875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,53.3359375,53.3359375,53.3359375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,110.5234375,110.5234375,110.5234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,321.2734375,321.2734375,321.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,1128.2734375,1128.2734375,1128.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,4284.2734375,4284.2734375,4284.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:17,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,55.2880859375,55.2880859375,55.2880859375,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 
+fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,72.28515625,72.28515625,72.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,119.03515625,119.03515625,119.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,265.28515625,265.28515625,265.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,775.28515625,775.28515625,775.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,2659.28515625,2659.28515625,2659.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,9883.28515625,9883.28515625,9883.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:18,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,55.2919921875,55.2919921875,55.2919921875,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,70.05078125,70.05078125,70.05078125,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,118.34765625,118.34765625,118.34765625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,289.53515625,289.53515625,289.53515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,920.28515625,920.28515625,920.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence 
length,2048,3335.28515625,3335.28515625,3335.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,12779.28515625,12779.28515625,12779.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:27,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,74.80078125,74.80078125,74.80078125,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,83.296875,83.296875,83.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,114.296875,114.296875,114.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,212.296875,212.296875,212.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,552.296875,552.296875,552.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,1808.296875,1808.296875,1808.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,6624.296875,6624.296875,6624.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:28,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,74.8046875,74.8046875,74.8046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,82.31640625,82.31640625,82.31640625,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,114.359375,114.359375,114.359375,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, 
""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,228.546875,228.546875,228.546875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,649.296875,649.296875,649.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,2260.296875,2260.296875,2260.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,8560.296875,8560.296875,8560.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:38,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,32.525390625,32.525390625,32.525390625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,37.7734375,37.7734375,37.7734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,53.2734375,53.2734375,53.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,102.2734375,102.2734375,102.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,272.2734375,272.2734375,272.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,900.2734375,900.2734375,900.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:39,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,3308.2734375,3308.2734375,3308.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 
15:11:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,32.53125,32.53125,32.53125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,36.8046875,36.8046875,36.8046875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,53.3359375,53.3359375,53.3359375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,110.5234375,110.5234375,110.5234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,321.2734375,321.2734375,321.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,1128.2734375,1128.2734375,1128.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,4284.2734375,4284.2734375,4284.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA GeForce RTX 3090,2025-05-27 15:11:46,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.25600001215934753,0.25436800718307495,0.2605184018611908,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.2569279968738556,0.25494399666786194,0.26105600595474243,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.25676798820495605,0.2550591826438904,0.2598848044872284,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.25841599702835083,0.25681281089782715,0.2625727951526642,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 
+fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.3150399923324585,0.31407999992370605,0.31611520051956177,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,0.8260959982872009,0.8238016366958618,0.828614354133606,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,2.5686399936676025,2.557523012161255,2.5757951736450195,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:08,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.276463985443115,5.270419120788574,5.286643028259277,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.498432159423828,10.476134300231934,10.51439380645752,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,20.82036781311035,20.771360397338867,20.881420135498047,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,42.07323455810547,41.776065826416016,42.370399475097656,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,81.8509750366211,81.8509750366211,81.8509750366211,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,165.88720703125,165.88720703125,165.88720703125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,331.2662658691406,331.2662658691406,331.2662658691406,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:14,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,0.8993600010871887,0.8924031853675842,0.9097279906272888,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, 
""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,0.8939200043678284,0.8890752196311951,0.9034687876701355,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,0.9244480133056641,0.9180480241775513,0.940447986125946,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,0.9229600429534912,0.915289580821991,0.9307839870452881,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,0.9950560331344604,0.9915199875831604,0.9971520304679871,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,2.5537919998168945,2.548985481262207,2.5564353466033936,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,7.698319911956787,7.67669153213501,7.713951587677002,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:25,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,5.840767860412598,5.819551944732666,5.864096164703369,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.064079284667969,11.050003051757812,11.102252960205078,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,21.443504333496094,21.364646911621094,21.61541748046875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,42.16088104248047,42.137290954589844,42.18446731567383,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,84.43017578125,84.43017578125,84.43017578125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, 
""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,169.27821350097656,169.27821350097656,169.27821350097656,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,342.5223388671875,342.5223388671875,342.5223388671875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:31,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.49110400676727295,0.4891200065612793,0.49513599276542664,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.4911839962005615,0.4894847869873047,0.4949440062046051,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.5103520154953003,0.5084800124168396,0.5146496295928955,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.5199040174484253,0.5182399749755859,0.5254335999488831,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,0.6806079745292664,0.6792960166931152,0.681990385055542,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,1.7373919486999512,1.7352639436721802,1.7395071983337402,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,5.2151360511779785,5.205132484436035,5.221510410308838,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:32,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.4123840034008026,0.41091200709342957,0.4163135886192322,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence 
length,128,0.4136800169944763,0.41203200817108154,0.4168703854084015,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.4320639967918396,0.4301888048648834,0.4355071783065796,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,0.44307199120521545,0.44010239839553833,0.4480448067188263,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,0.9624000191688538,0.9609023928642273,0.9633920192718506,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,2.6429600715637207,2.641439914703369,2.644223928451538,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,8.974464416503906,8.973376274108887,8.97913646697998,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:35,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.2598559856414795,0.2580096125602722,0.2628991901874542,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.2602880001068115,0.25900799036026,0.26241281628608704,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.2643519937992096,0.2627519965171814,0.26796799898147583,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.41286399960517883,0.4122239947319031,0.4134399890899658,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.9781439900398254,0.9763264060020447,0.9801728129386902,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA 
H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,2.659600019454956,2.655103921890259,2.6648640632629395,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,8.184944152832031,8.175705909729004,8.197542190551758,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:40,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.3048319816589355,5.287481784820557,5.315853118896484,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.493408203125,10.434623718261719,10.539365768432617,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,20.872079849243164,20.860185623168945,21.320632934570312,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,41.84241485595703,41.80018615722656,41.884647369384766,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,84.96883392333984,84.96883392333984,84.96883392333984,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,169.7915802001953,169.7915802001953,169.7915802001953,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,345.4809265136719,345.4809265136719,345.4809265136719,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:47,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,0.9144960045814514,0.9068800210952759,0.9251199960708618,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,0.9177280068397522,0.9107391834259033,0.9262208342552185,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, 
""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,0.9360480308532715,0.9290496110916138,0.949785590171814,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,1.2921760082244873,1.289574384689331,1.2943040132522583,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,2.9243199825286865,2.919097423553467,2.9282751083374023,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,7.83568000793457,7.829171180725098,7.843168258666992,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,24.4779052734375,24.40936279296875,24.545881271362305,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:25:56,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,5.912464141845703,5.879615783691406,5.923999786376953,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.05232048034668,11.035250663757324,11.079456329345703,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,21.471296310424805,21.445714950561523,21.49998664855957,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,42.718048095703125,42.69863510131836,42.73746109008789,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,86.00204467773438,86.00204467773438,86.00204467773438,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence 
length,2048,177.3928985595703,177.3928985595703,177.3928985595703,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,373.61773681640625,373.61773681640625,373.61773681640625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:03,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.5130239725112915,0.5107200145721436,0.5175104141235352,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.5187360048294067,0.5168319940567017,0.522816002368927,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.5284639596939087,0.5261759757995605,0.5319616198539734,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.8799999952316284,0.8791552186012268,0.8812223672866821,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,1.9606720209121704,1.9588288068771362,1.9625920057296753,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,5.239616394042969,5.233331203460693,5.246374607086182,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,16.295886993408203,16.174047470092773,16.315935134887695,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:05,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.4262079894542694,0.42505601048469543,0.42970240116119385,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.43747198581695557,0.43620482087135315,0.4399871826171875,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": 
""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.5542719960212708,0.5531839728355408,0.555072009563446,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,1.0854079723358154,1.0841728448867798,1.0862784385681152,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,2.6914560794830322,2.6902334690093994,2.6927361488342285,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,8.072175979614258,8.052319526672363,8.081612586975098,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,27.25152015686035,27.248275756835938,27.25334358215332,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:07,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.26579201221466064,0.26371198892593384,0.2690303921699524,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.26337599754333496,0.26162558794021606,0.2659648060798645,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.264384001493454,0.2627967894077301,0.267276793718338,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.3535360097885132,0.3527039885520935,0.3543359935283661,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.7347840070724487,0.7331455945968628,0.7361727952957153,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence 
length,2048,1.8545279502868652,1.850592017173767,1.8574399948120117,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,5.953392028808594,5.927840232849121,5.962080001831055,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:14,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,6.691328048706055,6.674118518829346,6.712192058563232,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,13.332127571105957,13.322579383850098,13.362988471984863,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,26.70470428466797,26.678035736083984,27.087322235107422,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,52.936126708984375,52.936126708984375,52.936126708984375,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,107.26537322998047,107.26537322998047,107.26537322998047,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,213.9727020263672,213.9727020263672,213.9727020263672,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,430.3240966796875,430.3240966796875,430.3240966796875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:22,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,0.912992000579834,0.8976320028305054,0.9327296018600464,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,0.9216639995574951,0.9107776284217834,0.9301823973655701,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB 
HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,0.915615975856781,0.9078848361968994,0.9261952042579651,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,1.1379199028015137,1.1355520486831665,1.1407424211502075,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,2.277343988418579,2.268371343612671,2.2814719676971436,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,5.6143999099731445,5.608166217803955,5.673030376434326,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,17.534591674804688,17.516069412231445,17.57676124572754,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:32,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,7.29852819442749,7.287238597869873,7.318784236907959,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,13.901632308959961,13.893203735351562,13.942361831665039,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,27.261056900024414,27.254297256469727,27.288244247436523,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,54.26707077026367,54.26707077026367,54.26707077026367,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,108.40013122558594,108.40013122558594,108.40013122558594,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,220.19622802734375,220.19622802734375,220.19622802734375,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, 
""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,4096,453.9944763183594,453.9944763183594,453.9944763183594,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:40,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.49564799666404724,0.4941760003566742,0.49819520115852356,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.5055680274963379,0.5036479830741882,0.5097920298576355,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.5073280334472656,0.5049920082092285,0.5109120011329651,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.7868000268936157,0.7859584093093872,0.7878463864326477,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,1.5349119901657104,1.5336960554122925,1.5368640422821045,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,3.791167974472046,3.787168025970459,3.802060842514038,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,11.613519668579102,11.596006393432617,11.618464469909668,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:42,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.41388800740242004,0.412447988986969,0.417279988527298,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.42691200971603394,0.42473599314689636,0.4324415922164917,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence 
length,256,0.4886400103569031,0.48771199584007263,0.48993921279907227,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,0.9216960072517395,0.9203839898109436,0.9231168031692505,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,1.9877119064331055,1.9866175651550293,1.9888639450073242,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,5.659264087677002,5.653772830963135,5.6628031730651855,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,18.87718391418457,18.870214462280273,18.878368377685547,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:45,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,64,0.26070401072502136,0.258950412273407,0.26361599564552307,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,128,0.2584800124168396,0.256985604763031,0.26101118326187134,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,256,0.25942400097846985,0.25811201333999634,0.2618303894996643,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,512,0.26097601652145386,0.25948798656463623,0.2640959918498993,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,1024,0.3149600028991699,0.3140160143375397,0.31593599915504456,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,2048,0.8244799971580505,0.8216319680213928,0.8271167874336243,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": 
""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,liger,forward,speed,ms,seq_len,sequence length,4096,2.5662078857421875,2.5587263107299805,2.5770816802978516,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:49,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,64,5.195775985717773,5.172947406768799,5.230342388153076,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,128,10.488927841186523,10.467231750488281,10.511955261230469,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,256,21.20012664794922,21.1026554107666,21.275672912597656,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,512,43.42755126953125,42.99705123901367,43.858055114746094,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,1024,84.55020904541016,84.55020904541016,84.55020904541016,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,2048,169.3335418701172,169.3335418701172,169.3335418701172,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,torch,forward,speed,ms,seq_len,sequence length,4096,340.14495849609375,340.14495849609375,340.14495849609375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:55,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,64,0.8945279717445374,0.886732816696167,0.9055423736572266,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,128,0.8908159732818604,0.8847360014915466,0.8983359932899475,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,256,0.9086400270462036,0.9012479782104492,0.9151040315628052,"{""batch_size"": 2, ""hidden_size"": 512, 
""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,512,0.9225280284881592,0.9153919816017151,0.9314560294151306,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,1024,0.9986559748649597,0.9929599761962891,1.0019199848175049,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,2048,2.5703680515289307,2.56607985496521,2.574105739593506,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,liger,full,speed,ms,seq_len,sequence length,4096,7.78985595703125,7.7626495361328125,7.792575836181641,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:26:56,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,64,5.764095783233643,5.736550331115723,5.7790656089782715,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,128,11.027040481567383,11.009875297546387,11.10332202911377,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,256,21.499038696289062,21.467283248901367,21.521759033203125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,512,42.39520263671875,42.34148025512695,42.44892120361328,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,1024,85.2570571899414,85.2570571899414,85.2570571899414,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence length,2048,172.73379516601562,172.73379516601562,172.73379516601562,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,torch,full,speed,ms,seq_len,sequence 
length,4096,347.4947509765625,347.4947509765625,347.4947509765625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:03,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,64,0.4941760003566742,0.49265921115875244,0.4977791905403137,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,128,0.49348801374435425,0.49185919761657715,0.4974527955055237,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,256,0.5101760029792786,0.5087360143661499,0.5148288011550903,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,512,0.5200639963150024,0.5186240077018738,0.5237439870834351,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,1024,0.6887840032577515,0.6859776377677917,0.6903167963027954,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,2048,1.7373759746551514,1.7341376543045044,1.7395455837249756,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,liger,backward,speed,ms,seq_len,sequence length,4096,5.201104164123535,5.196633815765381,5.208876609802246,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:04,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,64,0.4107840061187744,0.40908798575401306,0.41468799114227295,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,128,0.4121600091457367,0.4106624126434326,0.4156480133533478,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,256,0.4296959936618805,0.42847999930381775,0.4339391887187958,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA 
H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,512,0.43406400084495544,0.4329279959201813,0.43656960129737854,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,1024,0.9568639993667603,0.9556096196174622,0.9582463502883911,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,2048,2.6357598304748535,2.634399890899658,2.6394240856170654,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,torch,backward,speed,ms,seq_len,sequence length,4096,8.944831848144531,8.943455696105957,8.947711944580078,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:06,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,80.275390625,80.275390625,80.275390625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,85.5234375,85.5234375,85.5234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,101.0234375,101.0234375,101.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,150.0234375,150.0234375,150.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,320.0234375,320.0234375,320.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,948.0234375,948.0234375,948.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,3356.0234375,3356.0234375,3356.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:07,0.5.10 
+fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,80.28125,80.28125,80.28125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,84.5546875,84.5546875,84.5546875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,101.0859375,101.0859375,101.0859375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,158.2734375,158.2734375,158.2734375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,369.0234375,369.0234375,369.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,1176.0234375,1176.0234375,1176.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,4332.0234375,4332.0234375,4332.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,103.0380859375,103.0380859375,103.0380859375,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,120.78515625,120.78515625,120.78515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,166.78515625,166.78515625,166.78515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,313.03515625,313.03515625,313.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,823.03515625,823.03515625,823.03515625,"{""batch_size"": 
4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,2707.03515625,2707.03515625,2707.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,9931.03515625,9931.03515625,9931.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:14,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,103.0419921875,103.0419921875,103.0419921875,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,117.05078125,117.05078125,117.05078125,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,167.34765625,167.34765625,167.34765625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,337.28515625,337.28515625,337.28515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,968.03515625,968.03515625,968.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,3383.03515625,3383.03515625,3383.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,12827.03515625,12827.03515625,12827.03515625,"{""batch_size"": 4, ""hidden_size"": 768, ""num_heads"": 12, ""kernel_size"": 7, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,122.55078125,122.55078125,122.55078125,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,131.046875,131.046875,131.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": 
true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,162.046875,162.046875,162.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,260.046875,260.046875,260.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,600.046875,600.046875,600.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,1856.046875,1856.046875,1856.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,6672.046875,6672.046875,6672.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:22,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,122.5546875,122.5546875,122.5546875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,130.06640625,130.06640625,130.06640625,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,162.109375,162.109375,162.109375,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,276.296875,276.296875,276.296875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,697.046875,697.046875,697.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,2308.046875,2308.046875,2308.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 
+fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,8608.046875,8608.046875,8608.046875,"{""batch_size"": 2, ""hidden_size"": 1024, ""num_heads"": 16, ""kernel_size"": 9, ""dilation"": 1, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,64,80.275390625,80.275390625,80.275390625,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,128,85.5234375,85.5234375,85.5234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,256,101.0234375,101.0234375,101.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,512,150.0234375,150.0234375,150.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,1024,320.0234375,320.0234375,320.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,2048,948.0234375,948.0234375,948.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,liger,full,memory,MB,seq_len,sequence length,4096,3356.0234375,3356.0234375,3356.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:32,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,64,80.28125,80.28125,80.28125,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,128,84.5546875,84.5546875,84.5546875,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,256,101.0859375,101.0859375,101.0859375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,158.2734375,158.2734375,158.2734375,"{""batch_size"": 2, ""hidden_size"": 512, 
""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,369.0234375,369.0234375,369.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,1176.0234375,1176.0234375,1176.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,4332.0234375,4332.0234375,4332.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10 +distill_cosine_loss,liger,forward,speed,ms,BT,B x T,1024,13.828096389770508,13.821133041381836,13.885849952697754,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10 +distill_cosine_loss,liger,forward,speed,ms,BT,B x T,2048,27.57427215576172,27.52573432922363,27.579801940917967,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10 +distill_cosine_loss,liger,forward,speed,ms,BT,B x T,4096,54.79423904418945,54.79423904418945,54.79423904418945,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10 +distill_cosine_loss,liger,forward,speed,ms,BT,B x T,8192,109.73490905761719,109.73490905761719,109.73490905761719,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10 +distill_cosine_loss,torch,forward,speed,ms,BT,B x T,1024,16.456703186035156,15.045836448669434,16.761650466918944,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10 +distill_cosine_loss,torch,forward,speed,ms,BT,B x T,2048,29.703168869018555,29.69333839416504,29.71177024841309,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10 +distill_cosine_loss,torch,forward,speed,ms,BT,B x T,4096,59.177982330322266,59.177982330322266,59.177982330322266,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10 +distill_cosine_loss,torch,forward,speed,ms,BT,B x 
T,8192,118.3815689086914,118.3815689086914,118.3815689086914,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10 +distill_cosine_loss,liger,full,speed,ms,BT,B x T,1024,14.654463768005371,14.63398380279541,14.68006420135498,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10 +distill_cosine_loss,liger,full,speed,ms,BT,B x T,2048,28.274688720703125,28.27284507751465,28.279603958129883,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10 +distill_cosine_loss,liger,full,speed,ms,BT,B x T,4096,55.96672058105469,55.96672058105469,55.96672058105469,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10 +distill_cosine_loss,liger,full,speed,ms,BT,B x T,8192,111.38764953613281,111.38764953613281,111.38764953613281,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10 +distill_cosine_loss,torch,full,speed,ms,BT,B x T,1024,37.45382308959961,37.42556076049805,37.482085418701175,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10 +distill_cosine_loss,torch,full,speed,ms,BT,B x T,2048,73.56620788574219,73.56620788574219,73.56620788574219,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10 +distill_cosine_loss,torch,full,speed,ms,BT,B x T,4096,145.73056030273438,145.73056030273438,145.73056030273438,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10 +distill_cosine_loss,torch,full,speed,ms,BT,B x T,8192,291.5000305175781,291.5000305175781,291.5000305175781,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10 +distill_cosine_loss,liger,full,memory,MB,BT,B x T,1024,5059.26806640625,5059.26806640625,5059.26806640625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10 +distill_cosine_loss,liger,full,memory,MB,BT,B x T,2048,5087.27587890625,5087.27587890625,5087.27587890625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, 
""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10 +distill_cosine_loss,liger,full,memory,MB,BT,B x T,4096,5143.29150390625,5143.29150390625,5143.29150390625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10 +distill_cosine_loss,liger,full,memory,MB,BT,B x T,8192,5255.32275390625,5255.32275390625,5255.32275390625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10 +distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822265625,7566.2822265625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10 +distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10 +distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10 +distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10 +layer_norm,liger,forward,speed,ms,N,hidden size,1024,0.018848000094294548,0.018400000408291817,0.020102400332689285,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0 +layer_norm,liger,forward,speed,ms,N,hidden size,2048,0.029152000322937965,0.02876799926161766,0.029823999851942062,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0 +layer_norm,liger,forward,speed,ms,N,hidden size,4096,0.05104000121355057,0.05036799982190132,0.05177599936723709,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0 +layer_norm,liger,forward,speed,ms,N,hidden size,8192,0.0947519987821579,0.09436800330877304,0.09507200121879578,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0 +layer_norm,liger,forward,speed,ms,N,hidden size,16384,0.18476800620555878,0.18396799266338348,0.1852159947156906,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:11,0.6.0 +layer_norm,huggingface,forward,speed,ms,N,hidden size,1024,0.023584000766277313,0.023423999547958374,0.023840000852942467,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0 +layer_norm,huggingface,forward,speed,ms,N,hidden 
size,2048,0.03734400123357773,0.03702399879693985,0.037811201065778746,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0 +layer_norm,huggingface,forward,speed,ms,N,hidden size,4096,0.06617599725723267,0.06560000032186508,0.06678400188684464,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0 +layer_norm,huggingface,forward,speed,ms,N,hidden size,8192,0.15267199277877808,0.15190400183200836,0.15347200632095337,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0 +layer_norm,huggingface,forward,speed,ms,N,hidden size,16384,0.3067840039730072,0.3046143889427185,0.3081152021884918,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:14,0.6.0 +layer_norm,liger,backward,speed,ms,N,hidden size,1024,0.12006399780511856,0.11653760075569153,0.12467200309038162,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0 +layer_norm,liger,backward,speed,ms,N,hidden size,2048,0.1207360029220581,0.1176128014922142,0.1256511986255646,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0 +layer_norm,liger,backward,speed,ms,N,hidden size,4096,0.16630400717258453,0.16412800550460815,0.16838400065898895,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0 +layer_norm,liger,backward,speed,ms,N,hidden size,8192,0.31279999017715454,0.31116798520088196,0.3145279884338379,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0 +layer_norm,liger,backward,speed,ms,N,hidden size,16384,0.5776320099830627,0.5753471970558167,0.5798912048339844,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:16,0.6.0 +layer_norm,huggingface,backward,speed,ms,N,hidden size,1024,0.0605119988322258,0.059647999703884125,0.061344001442193985,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0 +layer_norm,huggingface,backward,speed,ms,N,hidden size,2048,0.09967999905347824,0.09849599748849869,0.10099200159311295,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0 +layer_norm,huggingface,backward,speed,ms,N,hidden size,4096,0.17881600558757782,0.17795200645923615,0.17971199750900269,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0 +layer_norm,huggingface,backward,speed,ms,N,hidden size,8192,0.33369600772857666,0.3328000009059906,0.33478400111198425,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0 +layer_norm,huggingface,backward,speed,ms,N,hidden size,16384,0.6424000263214111,0.6412223815917969,0.643455982208252,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:18,0.6.0 +layer_norm,liger,full,speed,ms,N,hidden size,1024,0.26576000452041626,0.2629248082637787,0.2701759934425354,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0 +layer_norm,liger,full,speed,ms,N,hidden size,2048,0.27427199482917786,0.26999040842056277,0.28091518878936766,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB 
HBM3,2025-07-17 18:18:21,0.6.0 +layer_norm,liger,full,speed,ms,N,hidden size,4096,0.27454400062561035,0.27004799246788025,0.2807359993457794,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0 +layer_norm,liger,full,speed,ms,N,hidden size,8192,0.40556800365448,0.40403199195861816,0.40723198652267456,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0 +layer_norm,liger,full,speed,ms,N,hidden size,16384,0.7608960270881653,0.7589311957359314,0.7631679773330688,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:21,0.6.0 +layer_norm,huggingface,full,speed,ms,N,hidden size,1024,0.08025600016117096,0.07942400127649307,0.08111999928951263,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,speed,ms,N,hidden size,2048,0.13315199315547943,0.13180799782276154,0.13468800485134125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,speed,ms,N,hidden size,4096,0.2417600005865097,0.24089600145816803,0.24262399971485138,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,speed,ms,N,hidden size,8192,0.4832639992237091,0.48214399814605713,0.4843647956848145,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,speed,ms,N,hidden size,16384,0.950575977563858,0.9484800100326538,0.9528064012527466,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,liger,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,liger,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,liger,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,liger,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,liger,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,memory,MB,N,hidden size,1024,80.0625,80.0625,80.0625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,memory,MB,N,hidden size,2048,160.09375,160.09375,160.09375,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,memory,MB,N,hidden size,4096,320.15625,320.15625,320.15625,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +layer_norm,huggingface,full,memory,MB,N,hidden size,8192,640.28125,640.28125,640.28125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 
18:18:23,0.6.0 +layer_norm,huggingface,full,memory,MB,N,hidden size,16384,1280.53125,1280.53125,1280.53125,"{""M"": 4096, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 80GB HBM3,2025-07-17 18:18:23,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,1024,0.01759999990463257,0.017311999574303627,0.017920000478625298,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,2048,0.02924799919128418,0.028863999992609024,0.029983999207615852,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,4096,0.05129599943757057,0.050624001771211624,0.05209600180387497,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,8192,0.09344000369310379,0.09296000003814697,0.09382399916648865,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,16384,0.1791680008172989,0.17814399302005768,0.1796800047159195,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,forward,speed,ms,H,hidden size,32768,0.43830400705337524,0.43744000792503357,0.43929600715637207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:20,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,1024,0.060095999389886856,0.059808000922203064,0.06054399907588959,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,2048,0.09084799885749817,0.09027200192213058,0.09161599725484848,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,4096,0.17820799350738525,0.17744000256061554,0.17897599935531616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,8192,0.312608003616333,0.3118720054626465,0.31324800848960876,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,16384,0.574944019317627,0.5740479826927185,0.5756288051605225,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,huggingface,forward,speed,ms,H,hidden size,32768,1.0943039655685425,1.0934272289276123,1.0951999425888062,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:23,0.6.0 +fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,1024,0.0352960005402565,0.03481600061058998,0.03811199963092804,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 +fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,2048,0.05430399999022484,0.05392000079154968,0.05503999814391136,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 
+fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,4096,0.10592000186443329,0.1054655984044075,0.10630399733781815,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 +fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,8192,0.19679999351501465,0.19631999731063843,0.19724799692630768,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 +fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,16384,0.37436801195144653,0.3733760118484497,0.3752320110797882,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 +fused_add_rms_norm,liger_rms_norm,forward,speed,ms,H,hidden size,32768,0.7376000285148621,0.7361343741416931,0.7391359806060791,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:26,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,1024,0.3147200047969818,0.30796160697937014,0.32764801383018494,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,2048,0.3089919984340668,0.30374398827552795,0.3226880133152008,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,4096,0.30691200494766235,0.3023296058177948,0.3205504059791565,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,8192,0.3246079981327057,0.3185984075069428,0.33656961321830753,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,16384,0.6010559797286987,0.5996800065040588,0.6026239991188049,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,speed,ms,H,hidden size,32768,1.8402559757232666,1.8322880268096924,1.8461120128631592,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:30,0.6.0 +fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,1024,0.23878400027751923,0.23545600473880768,0.2507520020008087,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 +fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,2048,0.34513600170612335,0.34377598762512207,0.34678399562835693,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 +fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,4096,0.6330879926681519,0.631712019443512,0.6345599889755249,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 +fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,8192,1.1185599565505981,1.1172800064086914,1.1196800470352173,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 +fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,16384,2.0697600841522217,2.0678528785705566,2.0713536739349365,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 
+fused_add_rms_norm,huggingface,full,speed,ms,H,hidden size,32768,3.9561920166015625,3.953824043273926,3.9581120014190674,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:33,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,1024,0.38916800916194916,0.3824320137500763,0.4037184059619903,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,2048,0.3890720009803772,0.38193280100822447,0.4032831907272339,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,4096,0.39715200662612915,0.3928639888763428,0.41097599267959595,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,8192,0.6275200247764587,0.6259520053863525,0.6287999749183655,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,16384,1.202239990234375,1.199679970741272,1.2048959732055664,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,speed,ms,H,hidden size,32768,2.7738559246063232,2.7705343723297116,2.777868890762329,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:36,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,1024,0.15619200468063354,0.15376000106334686,0.1661248028278351,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,2048,0.15825600177049637,0.15600000321865082,0.16911999881267548,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,4096,0.16700799763202667,0.16502399742603302,0.1709440052509308,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,8192,0.1712000072002411,0.1700800061225891,0.17215999960899353,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,16384,0.42505601048469543,0.4233280122280121,0.42691200971603394,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,backward,speed,ms,H,hidden size,32768,1.4057759642601013,1.3944000005722046,1.4099839925765991,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:39,0.6.0 +fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,1024,0.1520960032939911,0.15136000514030457,0.1528960019350052,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 +fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,2048,0.2533760070800781,0.2524160146713257,0.25436800718307495,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 
+fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,4096,0.4551039934158325,0.4540799856185913,0.45612800121307373,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 +fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,8192,0.8053439855575562,0.8038079738616943,0.806656002998352,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 +fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,16384,1.4933120012283325,1.492095947265625,1.49452805519104,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 +fused_add_rms_norm,huggingface,backward,speed,ms,H,hidden size,32768,2.8600640296936035,2.8583295822143557,2.8612607955932616,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:42,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,1024,0.20175999402999878,0.199072003364563,0.2154303938150406,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,2048,0.20263999700546265,0.20000000298023224,0.21675519943237304,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,4096,0.25276800990104675,0.2515519857406616,0.2539199888706207,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,8192,0.4322720021009445,0.43088001012802124,0.4336000084877014,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,16384,0.8288000226020813,0.8266303777694701,0.8311295866966247,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,backward,speed,ms,H,hidden size,32768,2.03987193107605,2.0360767364501955,2.0436416149139403,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,1024,72.546875,72.546875,72.546875,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,2048,145.0859375,145.0859375,145.0859375,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,4096,290.1640625,290.1640625,290.1640625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,8192,580.3203125,580.3203125,580.3203125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,16384,1160.6328125,1160.6328125,1160.6328125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_fused_add_rms_norm,full,memory,MB,H,hidden size,32768,2321.2578125,2321.2578125,2321.2578125,"{""M"": 2048, 
""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,1024,104.03173828125,104.03173828125,104.03173828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,2048,208.05517578125,208.05517578125,208.05517578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,4096,416.10205078125,416.10205078125,416.10205078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,8192,832.19580078125,832.19580078125,832.19580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,16384,1664.3125,1664.3125,1664.3125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,huggingface,full,memory,MB,H,hidden size,32768,3328.625,3328.625,3328.625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,1024,104.03564453125,104.03564453125,104.03564453125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,2048,208.06298828125,208.06298828125,208.06298828125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578125,416.11767578125,416.11767578125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0 +fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,2,40.75366401672363,40.749671173095706,40.75765686035156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1 +fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,4,80.95231628417969,80.95231628417969,80.95231628417969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1 +fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,8,163.58604431152344,163.58604431152344,163.58604431152344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1 
+fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,16,323.6761474609375,323.6761474609375,323.6761474609375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1 +fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,2,23.71225643157959,23.612825775146483,23.8354434967041,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1 +fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,4,46.86131286621094,46.80355911254883,46.91906661987304,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1 +fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,8,94.54898834228516,94.54898834228516,94.54898834228516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1 +fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,16,189.99501037597656,189.99501037597656,189.99501037597656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1 +fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,2,42.67263984680176,42.54085083007813,42.80442886352539,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1 +fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,4,82.2446060180664,82.2446060180664,82.2446060180664,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1 +fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,8,167.00416564941406,167.00416564941406,167.00416564941406,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1 +fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,16,327.0911865234375,327.0911865234375,327.0911865234375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1 +fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,2,45.36115264892578,45.241344451904304,45.480960845947266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1 +fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,4,90.00038146972656,90.00038146972656,90.00038146972656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1 +fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,8,177.22674560546875,177.22674560546875,177.22674560546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1 +fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,16,356.5383605957031,356.5383605957031,356.5383605957031,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": 
""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1 +fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,2,1.814527988433838,1.8124799728393555,1.8167808055877686,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1 +fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,4,1.84934401512146,1.8472959995269775,1.8524160385131836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1 +fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8872319459915161,1.893990397453308,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1 +fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,16,1.9722239971160889,1.9660799503326416,1.9763200283050537,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1 +fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,2,22.014975547790527,21.710438537597657,22.19417533874512,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1 +fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,4,41.83603096008301,41.752165222167974,41.91989669799805,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1 +fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,8,81.66400146484375,81.66400146484375,81.66400146484375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1 +fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,16,162.6429443359375,162.6429443359375,162.6429443359375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1 +fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1 +fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1 +fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1 +fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1 +fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, 
""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1 +fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1 +fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1 +fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1 +fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,2,40.72038269042969,40.71178131103516,40.728984069824214,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1 +fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,4,81.69369506835938,81.69369506835938,81.69369506835938,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1 +fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,8,162.79653930664062,162.79653930664062,162.79653930664062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1 +fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,16,323.6546630859375,323.6546630859375,323.6546630859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1 +fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,2,23.70047950744629,23.628594589233398,23.732429122924806,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1 +fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,4,47.36921691894531,47.085364532470706,47.65306930541992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1 +fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,8,94.83366394042969,94.83366394042969,94.83366394042969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1 +fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,16,190.0963897705078,190.0963897705078,190.0963897705078,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,2,42.318336486816406,42.15214080810547,42.48453216552734,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1 
+fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,4,82.4616928100586,82.4616928100586,82.4616928100586,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,8,163.43756103515625,163.43756103515625,163.43756103515625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,16,325.4384765625,325.4384765625,325.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,2,45.99193572998047,45.80761489868165,46.176256561279295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,4,88.57190704345703,88.57190704345703,88.57190704345703,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,8,176.94105529785156,176.94105529785156,176.94105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,16,356.0478820800781,356.0478820800781,356.0478820800781,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1 +fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,2,1.8242560029029846,1.8102271556854248,1.8309119939804077,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1 +fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,4,1.84934401512146,1.846886396408081,1.8534400463104248,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1 +fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8892799615859985,1.8933759927749634,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1 +fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,16,1.9752960205078125,1.9722239971160889,1.977344036102295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1 +fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,2,22.0262393951416,21.80997085571289,22.20482559204102,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1 +fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,4,41.54521560668945,41.224806213378905,41.865625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, 
""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1 +fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,8,81.21753692626953,81.21753692626953,81.21753692626953,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1 +fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,16,160.82022094726562,160.82022094726562,160.82022094726562,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1 +fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1 +fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1 +llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1 +llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1 +llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, 
""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1 +llama4_rope,huggingface,forward,speed,ms,H,hidden size,512,0.03759999945759773,0.03612799942493439,0.03907199949026108,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1 +llama4_rope,huggingface,forward,speed,ms,H,hidden size,2048,0.06185600161552429,0.061267200857400894,0.06252799928188324,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1 +llama4_rope,huggingface,forward,speed,ms,H,hidden size,8192,0.206496000289917,0.20582400262355804,0.20716799795627594,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:03,0.6.1 +llama4_rope,liger,backward,speed,ms,H,hidden size,512,0.15404799580574036,0.15241600573062897,0.15615999698638916,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1 +llama4_rope,liger,backward,speed,ms,H,hidden size,2048,0.1536320000886917,0.15190400183200836,0.1558080017566681,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1 +llama4_rope,liger,backward,speed,ms,H,hidden size,8192,0.15263999998569489,0.15094399452209473,0.15491199493408203,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:04,0.6.1 +llama4_rope,huggingface,backward,speed,ms,H,hidden size,512,0.13760000467300415,0.13574400544166565,0.14009599387645721,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1 +llama4_rope,huggingface,backward,speed,ms,H,hidden size,2048,0.13600000739097595,0.13449600338935852,0.1382720023393631,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1 +llama4_rope,huggingface,backward,speed,ms,H,hidden size,8192,0.21011200547218323,0.20924800634384155,0.21110400557518005,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:05,0.6.1 +llama4_rope,liger,full,speed,ms,H,hidden size,512,0.3652159869670868,0.3619840145111084,0.3699840009212494,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1 +llama4_rope,liger,full,speed,ms,H,hidden size,2048,0.3599040061235428,0.2881920039653778,0.36559998989105225,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1 +llama4_rope,liger,full,speed,ms,H,hidden size,8192,0.2874239981174469,0.2852480113506317,0.29029120206832887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:07,0.6.1 +llama4_rope,huggingface,full,speed,ms,H,hidden size,512,0.24691200256347656,0.24489599466323853,0.24961919784545897,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,huggingface,full,speed,ms,H,hidden 
size,2048,0.24774399399757385,0.24582399427890778,0.2505407989025116,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,huggingface,full,speed,ms,H,hidden size,8192,0.41414400935173035,0.41337600350379944,0.41491198539733887,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,liger,full,memory,MB,H,hidden size,512,37.23486328125,37.23486328125,37.23486328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,liger,full,memory,MB,H,hidden size,2048,52.89111328125,52.89111328125,52.89111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,liger,full,memory,MB,H,hidden size,8192,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,huggingface,full,memory,MB,H,hidden size,512,49.64111328125,49.64111328125,49.64111328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,huggingface,full,memory,MB,H,hidden size,2048,102.51611328125,102.51611328125,102.51611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,huggingface,full,memory,MB,H,hidden size,8192,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:08,0.6.1 +llama4_rope,liger,forward,speed,ms,T,sequence length,1024,0.07417599856853485,0.07248000055551529,0.07596799731254578,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1 +llama4_rope,liger,forward,speed,ms,T,sequence length,2048,0.08182399719953537,0.08006399869918823,0.08380799740552902,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1 +llama4_rope,liger,forward,speed,ms,T,sequence length,4096,0.11708799749612808,0.1167680025100708,0.11744000017642975,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1 +llama4_rope,liger,forward,speed,ms,T,sequence length,8192,0.2165440022945404,0.21596799790859222,0.21715199947357178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1 +llama4_rope,liger,forward,speed,ms,T,sequence length,16384,0.41756799817085266,0.41705599427223206,0.41811200976371765,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:10,0.6.1 +llama4_rope,huggingface,forward,speed,ms,T,sequence length,1024,0.11644800007343292,0.11590400338172913,0.11708799749612808,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1 
+llama4_rope,huggingface,forward,speed,ms,T,sequence length,2048,0.20659199357032776,0.20608000457286835,0.2072640061378479,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1 +llama4_rope,huggingface,forward,speed,ms,T,sequence length,4096,0.38553598523139954,0.3846847891807556,0.38624000549316406,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1 +llama4_rope,huggingface,forward,speed,ms,T,sequence length,8192,0.7411519885063171,0.7403839826583862,0.7420480251312256,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1 +llama4_rope,huggingface,forward,speed,ms,T,sequence length,16384,1.4553920030593872,1.4543871641159059,1.4562879800796509,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:12,0.6.1 +llama4_rope,liger,backward,speed,ms,T,sequence length,1024,0.11840000003576279,0.11711999773979187,0.12031999975442886,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1 +llama4_rope,liger,backward,speed,ms,T,sequence length,2048,0.12336000055074692,0.12198399752378464,0.12489599734544754,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1 +llama4_rope,liger,backward,speed,ms,T,sequence length,4096,0.12380799651145935,0.12240000069141388,0.12559999525547028,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1 +llama4_rope,liger,backward,speed,ms,T,sequence length,8192,0.2170879989862442,0.2165759950876236,0.21753600239753723,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1 +llama4_rope,liger,backward,speed,ms,T,sequence length,16384,0.4175359904766083,0.41705599427223206,0.4181375920772552,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:15,0.6.1 +llama4_rope,huggingface,backward,speed,ms,T,sequence length,1024,0.1189119964838028,0.11769600212574005,0.12003199756145477,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1 +llama4_rope,huggingface,backward,speed,ms,T,sequence length,2048,0.21011200547218323,0.20927999913692474,0.21119999885559082,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1 +llama4_rope,huggingface,backward,speed,ms,T,sequence length,4096,0.39740800857543945,0.3963199853897095,0.39824000000953674,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1 +llama4_rope,huggingface,backward,speed,ms,T,sequence length,8192,0.7540159821510315,0.7528960108757019,0.7550719976425171,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1 +llama4_rope,huggingface,backward,speed,ms,T,sequence 
length,16384,1.4822720289230347,1.4810559749603271,1.4833600521087646,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:17,0.6.1 +llama4_rope,liger,full,speed,ms,T,sequence length,1024,0.2874400019645691,0.2853440046310425,0.29052799940109253,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1 +llama4_rope,liger,full,speed,ms,T,sequence length,2048,0.28646400570869446,0.2845759987831116,0.28963199257850647,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1 +llama4_rope,liger,full,speed,ms,T,sequence length,4096,0.29897600412368774,0.29660800099372864,0.302131199836731,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1 +llama4_rope,liger,full,speed,ms,T,sequence length,8192,0.4315840005874634,0.4304639995098114,0.43270400166511536,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1 +llama4_rope,liger,full,speed,ms,T,sequence length,16384,0.833184003829956,0.8322240114212036,0.8345024228096007,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:19,0.6.1 +llama4_rope,huggingface,full,speed,ms,T,sequence length,1024,0.24592000246047974,0.24396799504756927,0.24876800179481506,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,speed,ms,T,sequence length,2048,0.4138239920139313,0.41308799386024475,0.4145599901676178,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,speed,ms,T,sequence length,4096,0.7800959944725037,0.7790719866752625,0.7810239791870117,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,speed,ms,T,sequence length,8192,1.4911680221557617,1.4902976036071778,1.4922879934310913,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,speed,ms,T,sequence length,16384,2.9344160556793213,2.9333438873291016,2.9353599548339844,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,liger,full,memory,MB,T,sequence length,1024,73.75830078125,73.75830078125,73.75830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,liger,full,memory,MB,T,sequence length,2048,115.51611328125,115.51611328125,115.51611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,liger,full,memory,MB,T,sequence length,4096,199.03173828125,199.03173828125,199.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, 
""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,liger,full,memory,MB,T,sequence length,8192,366.06298828125,366.06298828125,366.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,liger,full,memory,MB,T,sequence length,16384,700.12548828125,700.12548828125,700.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,memory,MB,T,sequence length,1024,173.00830078125,173.00830078125,173.00830078125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,memory,MB,T,sequence length,2048,314.01611328125,314.01611328125,314.01611328125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,memory,MB,T,sequence length,4096,596.03173828125,596.03173828125,596.03173828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,memory,MB,T,sequence length,8192,1160.06298828125,1160.06298828125,1160.06298828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +llama4_rope,huggingface,full,memory,MB,T,sequence length,16384,2288.12548828125,2288.12548828125,2288.12548828125,"{""dtype"": ""torch.bfloat16"", ""hidden_size"": 8192, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:21,0.6.1 +tiled_geglu,liger,full,speed,ms,T,sequence length,1024,2.1678080558776855,2.166579246520996,2.1682305335998535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3 +tiled_geglu,liger,full,speed,ms,T,sequence length,2048,4.344256401062012,4.343987464904785,4.34452486038208,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3 +tiled_geglu,liger,full,speed,ms,T,sequence length,4096,8.653023719787598,8.653023719787598,8.653023719787598,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3 +tiled_geglu,liger,full,speed,ms,T,sequence length,8192,16.909311294555664,16.909311294555664,16.909311294555664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3 +tiled_geglu,liger,full,speed,ms,T,sequence length,16384,33.63123321533203,33.63123321533203,33.63123321533203,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:48,0.6.3 +tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.353935956954956,3.353523015975952,3.35434889793396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3 +tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3 +tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.495424270629883,11.495424270629883,11.495424270629883,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3 +tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.68614387512207,23.68614387512207,23.68614387512207,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3 +tiled_geglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.478782653808594,47.478782653808594,47.478782653808594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:49,0.6.3 +tiled_geglu,liger,forward,speed,ms,T,sequence length,1024,0.6614400148391724,0.6594560146331787,0.6635519862174988,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,liger,forward,speed,ms,T,sequence length,2048,1.3471999168395996,1.346560001373291,1.3475840091705322,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,liger,forward,speed,ms,T,sequence length,4096,2.752511978149414,2.7261502742767334,2.7844607830047607,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,liger,forward,speed,ms,T,sequence length,8192,5.433343887329102,5.433343887329102,5.433343887329102,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,liger,forward,speed,ms,T,sequence length,16384,10.712063789367676,10.712063789367676,10.712063789367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7403519749641418,0.7402047514915466,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3 +tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3941760063171387,1.3895679712295532,1.398144006729126,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3 +tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7586560249328613,2.7585408687591553,2.759884834289551,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3 +tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.789696216583252,5.789696216583252,5.789696216583252,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3 +tiled_geglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.810815811157227,11.810815811157227,11.810815811157227,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:53,0.6.3 +tiled_geglu,liger,backward,speed,ms,T,sequence length,1024,1.491968035697937,1.4916608333587646,1.4940160512924194,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3 +tiled_geglu,liger,backward,speed,ms,T,sequence length,2048,3.0185279846191406,3.0131328105926514,3.0555264949798584,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3 +tiled_geglu,liger,backward,speed,ms,T,sequence length,4096,6.021120071411133,6.021120071411133,6.021120071411133,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3 +tiled_geglu,liger,backward,speed,ms,T,sequence length,8192,11.512767791748047,11.512767791748047,11.512767791748047,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3 +tiled_geglu,liger,backward,speed,ms,T,sequence length,16384,22.806528091430664,22.806528091430664,22.806528091430664,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 
4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:56,0.6.3 +tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6060800552368164,2.6053311824798584,2.607308864593506,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3 +tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.665375709533691,4.664742469787598,4.666009426116943,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3 +tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.71731185913086,8.71731185913086,8.71731185913086,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3 +tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,8192,17.99782371520996,17.99782371520996,17.99782371520996,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3 +tiled_geglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.64400100708008,35.64400100708008,35.64400100708008,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:57,0.6.3 +tiled_geglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 
2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:00,0.6.3 +tiled_geglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3 +tiled_geglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3 +tiled_geglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3 +tiled_geglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3 +tiled_geglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:03,0.6.3 +tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, 
""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3 +tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3 +tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 
2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3 +tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3 +tiled_geglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:06,0.6.3 +tiled_swiglu,liger,full,speed,ms,T,sequence length,1024,2.165760040283203,2.164659261703491,2.167193651199341,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3 +tiled_swiglu,liger,full,speed,ms,T,sequence length,2048,4.371456146240234,4.368383884429932,4.374527931213379,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3 +tiled_swiglu,liger,full,speed,ms,T,sequence length,4096,8.935423851013184,8.935423851013184,8.935423851013184,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3 +tiled_swiglu,liger,full,speed,ms,T,sequence length,8192,17.078943252563477,17.078943252563477,17.078943252563477,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3 +tiled_swiglu,liger,full,speed,ms,T,sequence length,16384,33.74857711791992,33.74857711791992,33.74857711791992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:10,0.6.3 +tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,1024,3.3510398864746094,3.3507328033447266,3.3513472080230713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3 +tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,2048,6.023168087005615,6.023168087005615,6.023168087005615,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3 +tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,4096,11.609087944030762,11.609087944030762,11.609087944030762,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3 +tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,8192,23.8591365814209,23.8591365814209,23.8591365814209,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3 +tiled_swiglu,liger_tiled,full,speed,ms,T,sequence length,16384,47.721473693847656,47.721473693847656,47.721473693847656,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:11,0.6.3 +tiled_swiglu,liger,forward,speed,ms,T,sequence length,1024,0.6594560146331787,0.6594560146331787,0.6604800224304199,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3 +tiled_swiglu,liger,forward,speed,ms,T,sequence length,2048,1.3537280559539795,1.3527040481567383,1.3547519445419312,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3 +tiled_swiglu,liger,forward,speed,ms,T,sequence length,4096,2.7152960300445557,2.715123176574707,2.7155072689056396,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3 +tiled_swiglu,liger,forward,speed,ms,T,sequence length,8192,5.3361921310424805,5.3361921310424805,5.3361921310424805,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3 +tiled_swiglu,liger,forward,speed,ms,T,sequence length,16384,10.870783805847168,10.870783805847168,10.870783805847168,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:14,0.6.3 +tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,1024,0.7395360469818115,0.7383040189743042,0.7413759827613831,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,2048,1.3965599536895752,1.387935996055603,1.4024640321731567,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,4096,2.7778561115264893,2.777395248413086,2.7780096530914307,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 
+tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,8192,5.829631805419922,5.829631805419922,5.829631805419922,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,liger_tiled,forward,speed,ms,T,sequence length,16384,11.841535568237305,11.841535568237305,11.841535568237305,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,liger,backward,speed,ms,T,sequence length,1024,1.4970879554748535,1.4961408376693726,1.4970879554748535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3 +tiled_swiglu,liger,backward,speed,ms,T,sequence length,2048,3.052351951599121,3.0518529415130615,3.0550782680511475,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3 +tiled_swiglu,liger,backward,speed,ms,T,sequence length,4096,6.074687957763672,6.074687957763672,6.074687957763672,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3 +tiled_swiglu,liger,backward,speed,ms,T,sequence length,8192,11.630592346191406,11.630592346191406,11.630592346191406,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3 +tiled_swiglu,liger,backward,speed,ms,T,sequence length,16384,22.76793670654297,22.76793670654297,22.76793670654297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:17,0.6.3 +tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,1024,2.6021440029144287,2.6000702381134033,2.6032767295837402,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3 +tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,2048,4.641791820526123,4.641791820526123,4.641791820526123,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3 +tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,4096,8.761343955993652,8.761343955993652,8.761343955993652,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3 +tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence 
length,8192,17.966079711914062,17.966079711914062,17.966079711914062,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3 +tiled_swiglu,liger_tiled,backward,speed,ms,T,sequence length,16384,35.657344818115234,35.657344818115234,35.657344818115234,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:18,0.6.3 +tiled_swiglu,liger,full,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3 +tiled_swiglu,liger,full,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3 +tiled_swiglu,liger,full,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3 +tiled_swiglu,liger,full,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3 +tiled_swiglu,liger,full,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:21,0.6.3 +tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3 +tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3 +tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3 +tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3 +tiled_swiglu,liger_tiled,full,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, 
""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:22,0.6.3 +tiled_swiglu,liger,forward,memory,MB,T,sequence length,1024,128.25,128.25,128.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger,forward,memory,MB,T,sequence length,2048,192.25,192.25,192.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger,forward,memory,MB,T,sequence length,4096,320.25,320.25,320.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger,forward,memory,MB,T,sequence length,8192,576.25,576.25,576.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger,forward,memory,MB,T,sequence length,16384,1088.25,1088.25,1088.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,1024,92.25,92.25,92.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,2048,120.25,120.25,120.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,4096,176.25,176.25,176.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,8192,288.25,288.25,288.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger_tiled,forward,memory,MB,T,sequence length,16384,512.25,512.25,512.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:25,0.6.3 +tiled_swiglu,liger,backward,memory,MB,T,sequence length,1024,232.25,232.25,232.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3 +tiled_swiglu,liger,backward,memory,MB,T,sequence length,2048,336.25,336.25,336.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3 +tiled_swiglu,liger,backward,memory,MB,T,sequence length,4096,544.25,544.25,544.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3 +tiled_swiglu,liger,backward,memory,MB,T,sequence length,8192,960.25,960.25,960.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3 +tiled_swiglu,liger,backward,memory,MB,T,sequence length,16384,1792.25,1792.25,1792.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:27,0.6.3 +tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,1024,186.25,186.25,186.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3 +tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,2048,244.25,244.25,244.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3 +tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,4096,360.25,360.25,360.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3 +tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,8192,592.25,592.25,592.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3 +tiled_swiglu,liger_tiled,backward,memory,MB,T,sequence length,16384,1056.25,1056.25,1056.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:28,0.6.3 +tiled_geglu,huggingface,full,speed,ms,T,sequence length,1024,2.3357439041137695,2.3357439041137695,2.3375871181488037,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3 +tiled_geglu,huggingface,full,speed,ms,T,sequence length,2048,4.764671802520752,4.764671802520752,4.764671802520752,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3 +tiled_geglu,huggingface,full,speed,ms,T,sequence length,4096,9.4236478805542,9.4236478805542,9.4236478805542,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3 +tiled_geglu,huggingface,full,speed,ms,T,sequence length,8192,17.628543853759766,17.628543853759766,17.628543853759766,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3 +tiled_geglu,huggingface,full,speed,ms,T,sequence length,16384,35.06790542602539,35.06790542602539,35.06790542602539,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:47,0.6.3 +tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.418976068496704,3.4176511764526367,3.4203009605407715,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3 +tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.158143997192383,6.158143997192383,6.158143997192383,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3 +tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.934720039367676,11.934720039367676,11.934720039367676,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3 +tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.731647491455078,24.731647491455078,24.731647491455078,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3 +tiled_geglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.46227264404297,49.46227264404297,49.46227264404297,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:51,0.6.3 +tiled_geglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6743040084838867,0.6736640334129333,0.677068829536438,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,huggingface,forward,speed,ms,T,sequence length,2048,1.418239951133728,1.418239951133728,1.421120047569275,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", 
""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,huggingface,forward,speed,ms,T,sequence length,4096,2.88972806930542,2.889113664627075,2.8909568786621094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,huggingface,forward,speed,ms,T,sequence length,8192,5.701375961303711,5.701375961303711,5.701375961303711,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,huggingface,forward,speed,ms,T,sequence length,16384,11.276288032531738,11.276288032531738,11.276288032531738,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:52,0.6.3 +tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7433919906616211,0.7423999905586243,0.7444480061531067,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3 +tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4137760400772095,1.4131200313568115,1.4152319431304932,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3 +tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.8241920471191406,2.823500871658325,2.8266496658325195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3 +tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.087679862976074,6.087679862976074,6.087679862976074,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3 +tiled_geglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.353535652160645,12.353535652160645,12.353535652160645,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:54,0.6.3 +tiled_geglu,huggingface,backward,speed,ms,T,sequence length,1024,1.5499199628829956,1.5489535331726074,1.5523840188980103,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3 +tiled_geglu,huggingface,backward,speed,ms,T,sequence length,2048,3.171328067779541,3.169484853744507,3.173171281814575,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, 
""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3 +tiled_geglu,huggingface,backward,speed,ms,T,sequence length,4096,6.263807773590088,6.263807773590088,6.263807773590088,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3 +tiled_geglu,huggingface,backward,speed,ms,T,sequence length,8192,12.046143531799316,12.046143531799316,12.046143531799316,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3 +tiled_geglu,huggingface,backward,speed,ms,T,sequence length,16384,23.839744567871094,23.839744567871094,23.839744567871094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:55,0.6.3 +tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6757121086120605,2.6755776405334473,2.676710367202759,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3 +tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.7329277992248535,4.7329277992248535,4.7329277992248535,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3 +tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.078783988952637,9.078783988952637,9.078783988952637,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3 +tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.63680076599121,18.63680076599121,18.63680076599121,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3 +tiled_geglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.06163024902344,37.06163024902344,37.06163024902344,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:58,0.6.3 +tiled_geglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3 +tiled_geglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, 
""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3 +tiled_geglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3 +tiled_geglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3 +tiled_geglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:22:59,0.6.3 +tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, 
""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:02,0.6.3 +tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:04,0.6.3 +tiled_geglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,huggingface,backward,memory,MB,T,sequence 
length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:05,0.6.3 +tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3 +tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3 +tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3 +tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3 +tiled_geglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""gelu_pytorch_tanh"", ""activation_type"": ""geglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:07,0.6.3 +tiled_swiglu,huggingface,full,speed,ms,T,sequence length,1024,2.2517759799957275,2.2517759799957275,2.254848003387451,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3 +tiled_swiglu,huggingface,full,speed,ms,T,sequence length,2048,4.588511943817139,4.587302207946777,4.5897216796875,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3 +tiled_swiglu,huggingface,full,speed,ms,T,sequence length,4096,9.233407974243164,9.233407974243164,9.233407974243164,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3 +tiled_swiglu,huggingface,full,speed,ms,T,sequence length,8192,17.869823455810547,17.869823455810547,17.869823455810547,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, 
""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3 +tiled_swiglu,huggingface,full,speed,ms,T,sequence length,16384,35.34422302246094,35.34422302246094,35.34422302246094,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:08,0.6.3 +tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,1024,3.4257922172546387,3.424870491027832,3.426713705062866,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3 +tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,2048,6.155263900756836,6.155263900756836,6.155263900756836,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3 +tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,4096,11.92959976196289,11.92959976196289,11.92959976196289,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3 +tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,8192,24.815616607666016,24.815616607666016,24.815616607666016,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3 +tiled_swiglu,deepspeed_tiled,full,speed,ms,T,sequence length,16384,49.62918472290039,49.62918472290039,49.62918472290039,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:12,0.6.3 +tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,1024,0.6748160123825073,0.6737920045852661,0.6758400201797485,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3 +tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,2048,1.4332799911499023,1.4325759410858154,1.4335999488830566,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3 +tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,4096,2.91212797164917,2.904217481613159,2.9146623611450195,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3 +tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,8192,5.658976078033447,5.658976078033447,5.658976078033447,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 
06:23:13,0.6.3 +tiled_swiglu,huggingface,forward,speed,ms,T,sequence length,16384,11.341952323913574,11.341952323913574,11.341952323913574,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:13,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,1024,0.7454720139503479,0.7429631948471069,0.7456768155097961,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,2048,1.4120960235595703,1.410048007965088,1.4120960235595703,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,4096,2.825216054916382,2.825216054916382,2.8264448642730713,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,8192,6.077439785003662,6.077439785003662,6.077439785003662,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,speed,ms,T,sequence length,16384,12.356608390808105,12.356608390808105,12.356608390808105,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:15,0.6.3 +tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,1024,1.551360011100769,1.5511807203292847,1.5532032251358032,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3 +tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,2048,3.1928319931030273,3.1885311603546143,3.1971328258514404,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3 +tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,4096,6.273248195648193,6.273248195648193,6.273248195648193,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3 +tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,8192,12.058752059936523,12.058752059936523,12.058752059936523,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3 
+tiled_swiglu,huggingface,backward,speed,ms,T,sequence length,16384,23.853055953979492,23.853055953979492,23.853055953979492,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:16,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,1024,2.6746881008148193,2.6728639602661133,2.6789886951446533,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,2048,4.739071846008301,4.739071846008301,4.739071846008301,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,4096,9.084927558898926,9.084927558898926,9.084927558898926,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,8192,18.729759216308594,18.729759216308594,18.729759216308594,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,speed,ms,T,sequence length,16384,37.13724899291992,37.13724899291992,37.13724899291992,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,huggingface,full,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,huggingface,full,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,huggingface,full,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,huggingface,full,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,huggingface,full,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", 
""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:20,0.6.3 +tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3 +tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3 +tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3 +tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3 +tiled_swiglu,deepspeed_tiled,full,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:23,0.6.3 +tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,1024,144.25,144.25,144.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3 +tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,2048,224.25,224.25,224.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3 +tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,4096,384.25,384.25,384.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3 +tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,8192,704.25,704.25,704.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3 +tiled_swiglu,huggingface,forward,memory,MB,T,sequence length,16384,1344.25,1344.25,1344.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:24,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,1024,90.25,90.25,90.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": 
""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,2048,116.25,116.25,116.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,4096,168.25,168.25,168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,8192,272.25,272.25,272.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,deepspeed_tiled,forward,memory,MB,T,sequence length,16384,480.25,480.25,480.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,1024,264.25,264.25,264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,2048,400.25,400.25,400.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,4096,688.25,688.25,688.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,8192,1264.25,1264.25,1264.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,huggingface,backward,memory,MB,T,sequence length,16384,2416.25,2416.25,2416.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:26,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,1024,190.25,190.25,190.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,2048,252.25,252.25,252.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 
06:23:29,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,4096,376.25,376.25,376.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,8192,640.25,640.25,640.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3 +tiled_swiglu,deepspeed_tiled,backward,memory,MB,T,sequence length,16384,1168.25,1168.25,1168.25,"{""bsz"": 2, ""hidden_size"": 2048, ""intermediate_size"": 4096, ""hidden_act"": ""silu"", ""activation_type"": ""swiglu"", ""num_shards"": 4, ""dtype"": ""torch.bfloat16""}",NVIDIA GeForce RTX 4090,2025-11-11 06:23:29,0.6.3 +tvd,liger,full,memory,MB,V,vocab size,4096,1792.0009765625,1792.0009765625,1792.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,full,memory,MB,V,vocab size,8192,3584.0009765625,3584.0009765625,3584.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,full,memory,MB,V,vocab size,16384,7168.0009765625,7168.0009765625,7168.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,full,memory,MB,V,vocab size,32768,14336.0009765625,14336.0009765625,14336.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,full,memory,MB,V,vocab size,65536,28672.0,28672.0,28672.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,full,memory,MB,V,vocab size,131072,57344.0,57344.0,57344.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,4096,2048.0009765625,2048.0009765625,2048.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,8192,4096.0009765625,4096.0009765625,4096.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,16384,8192.0009765625,8192.0009765625,8192.0009765625,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,32768,16384.0,16384.0,16384.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,65536,32768.0,32768.0,32768.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,torch,full,memory,MB,V,vocab size,131072,65536.0,65536.0,65536.0,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:48,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,4096,0.2757120132446289,0.27487359642982484,0.27616640329360964,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:50,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,8192,0.5338559746742249,0.5333759784698486,0.5346879959106445,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:50,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,16384,1.0511679649353027,1.0505280494689941,1.0521472215652465,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:50,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,32768,2.0986878871917725,2.09736967086792,2.0999168872833254,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB 
HBM3,2026-03-03 23:02:50,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,65536,4.221951961517334,4.22039680480957,4.222847938537598,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:50,0.7.0 +tvd,liger,forward,speed,ms,V,vocab size,131072,8.501215934753418,8.498592376708984,8.50380802154541,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:50,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,4096,0.7288320064544678,0.727942419052124,0.7296640276908875,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,8192,1.4264639616012573,1.42576003074646,1.4272960424423218,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,16384,2.81440007686615,2.8132031917572022,2.815097618103027,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,32768,5.5965118408203125,5.59548807144165,5.598131275177002,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,65536,11.178752422332764,11.176428604125977,11.180454635620118,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,torch,forward,speed,ms,V,vocab size,131072,22.33670425415039,22.334880065917968,22.339027404785156,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:51,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,4096,1.123952031135559,1.1221888303756713,1.1291328191757202,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,8192,2.1660319566726685,2.162835216522217,2.169088077545166,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,16384,4.563424110412598,4.559807777404785,4.5669121742248535,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,32768,9.092079639434814,9.089529991149902,9.094182014465332,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,65536,18.217248916625977,18.20675277709961,18.219014739990236,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,liger,full,speed,ms,V,vocab size,131072,36.477935791015625,36.46965026855469,36.48622131347656,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:53,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,4096,2.1256959438323975,2.1249279975891113,2.1270463466644287,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,8192,4.191232204437256,4.189510250091553,4.192793464660644,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,16384,8.638431549072266,8.636992454528809,8.639007568359375,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,32768,17.25654411315918,17.25450286865234,17.25882225036621,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,65536,34.54822540283203,34.546746826171876,34.549703979492186,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,torch,full,speed,ms,V,vocab size,131072,69.17910766601562,69.17910766601562,69.17910766601562,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:54,0.7.0 +tvd,liger,backward,speed,ms,V,vocab 
size,4096,0.8502079844474792,0.8484799861907959,0.8526080250740051,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,liger,backward,speed,ms,V,vocab size,8192,1.6321280002593994,1.629702377319336,1.6350399732589722,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,liger,backward,speed,ms,V,vocab size,16384,3.5109760761260986,3.5084415912628173,3.513107109069824,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,liger,backward,speed,ms,V,vocab size,32768,6.989071846008301,6.985472011566161,6.994240188598633,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,liger,backward,speed,ms,V,vocab size,65536,13.969247817993164,13.95904598236084,13.971328163146971,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,liger,backward,speed,ms,V,vocab size,131072,27.982528686523438,27.963673400878903,27.987577819824217,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:55,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,4096,1.398911952972412,1.3979583740234376,1.4000320434570312,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,8192,2.7701759338378906,2.7694976329803467,2.7718528747558593,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,16384,5.828160047531128,5.8249921798706055,5.829792022705078,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,32768,11.665760040283203,11.664883232116699,11.666317176818847,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,65536,23.379840850830078,23.37938575744629,23.381267929077147,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +tvd,torch,backward,speed,ms,V,vocab size,131072,46.83844757080078,46.8328125,46.84408264160156,"{""B"": 8, ""T"": 2048}",NVIDIA H100 80GB HBM3,2026-03-03 23:02:57,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,32,0.017535999417304993,0.016863999888300896,0.01833599992096424,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,64,0.018848000094294548,0.018015999346971512,0.019487999379634857,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,128,0.026623999699950218,0.024607999250292778,0.026688000187277794,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,256,0.038943998515605927,0.03888000175356865,0.03903999924659729,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,512,0.06351999938488007,0.06345599889755249,0.06550399959087372,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,1024,0.11475200206041336,0.11468800157308578,0.11673600226640701,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", 
""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,liger,forward,speed,ms,C,num_channels,2048,0.21910400688648224,0.217056006193161,0.22115199267864227,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:15,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,32,0.030688000842928886,0.030592000111937523,0.030751999467611313,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,64,0.043007999658584595,0.04294399917125702,0.04303999990224838,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,128,0.07168000191450119,0.07161600142717361,0.07174400240182877,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,256,0.13516800105571747,0.1351040005683899,0.13523200154304504,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,512,0.25808000564575195,0.2580159902572632,0.25900799036026,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,1024,0.4986239969730377,0.4976640045642853,0.4997439980506897,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,huggingface,forward,speed,ms,C,num_channels,2048,0.9819360077381134,0.9800639748573303,0.9830080270767212,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:19,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,32,0.1658720001578331,0.16368000209331512,0.16958080232143402,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,64,0.1730239987373352,0.17123199999332428,0.17520000040531158,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,128,0.1695999950170517,0.16783360242843628,0.1717183977365494,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,256,0.174112007021904,0.17206400632858276,0.17718400061130524,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,512,0.18745599687099457,0.18636800348758698,0.18848000466823578,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,1024,0.3388479948043823,0.33792001008987427,0.3400000035762787,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 
1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,liger,full,speed,ms,C,num_channels,2048,0.6390079855918884,0.6371200084686279,0.6410560011863708,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:22,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,32,0.08396799862384796,0.08390399813652039,0.08403199911117554,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,64,0.11267200112342834,0.11260800063610077,0.1128000020980835,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,128,0.20054399967193604,0.19868800044059753,0.20080000162124634,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,256,0.35020801424980164,0.34828799962997437,0.3511039912700653,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,512,0.6307839751243591,0.6297919750213623,0.6309120059013367,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,1024,1.177664041519165,1.1766079664230347,1.1796480417251587,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,huggingface,full,speed,ms,C,num_channels,2048,2.2947518825531006,2.292736053466797,2.296736001968384,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:26,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,32,0.06643199920654297,0.0655359998345375,0.06752000004053116,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,64,0.06732799857854843,0.0663679987192154,0.06838399916887283,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,128,0.07171200215816498,0.06969600170850754,0.07273600250482559,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,256,0.07580800354480743,0.07571200281381607,0.07683199644088745,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,512,0.12697599828243256,0.1249919980764389,0.12703999876976013,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,1024,0.2253440022468567,0.2252800017595291,0.22729599475860596,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA 
B200,2026-02-28 00:23:28,0.7.0 +group_norm,liger,backward,speed,ms,C,num_channels,2048,0.42585599422454834,0.42396798729896545,0.4260160028934479,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:28,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,32,0.05532800033688545,0.05526399984955788,0.056352000683546066,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,64,0.07372800260782242,0.07171200215816498,0.0739263966679573,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,128,0.13315199315547943,0.13308799266815186,0.13331200182437897,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,256,0.21916800737380981,0.21904000639915466,0.21926400065422058,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,512,0.374783992767334,0.37379199266433716,0.37484800815582275,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,1024,0.6820799708366394,0.6810240149497986,0.6839039921760559,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,backward,speed,ms,C,num_channels,2048,1.3158719539642334,1.3157440423965454,1.3177599906921387,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,32,40.01171875,40.01171875,40.01171875,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,64,80.01953125,80.01953125,80.01953125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,128,160.03515625,160.03515625,160.03515625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,256,320.0703125,320.0703125,320.0703125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,512,640.140625,640.140625,640.140625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,1024,1280.28125,1280.28125,1280.28125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,liger,full,memory,MB,C,num_channels,2048,2560.5625,2560.5625,2560.5625,"{""M"": 128, ""H"": 512, 
""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:31,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,32,40.06640625,40.06640625,40.06640625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,64,80.12890625,80.12890625,80.12890625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,128,160.25390625,160.25390625,160.25390625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,256,320.5078125,320.5078125,320.5078125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,full,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,32,40.01171875,40.01171875,40.01171875,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,64,80.01953125,80.01953125,80.01953125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,128,160.03515625,160.03515625,160.03515625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,256,320.0703125,320.0703125,320.0703125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,512,640.140625,640.140625,640.140625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,1024,1280.28125,1280.28125,1280.28125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,forward,memory,MB,C,num_channels,2048,2560.5625,2560.5625,2560.5625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,32,40.06640625,40.06640625,40.06640625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 
+group_norm,huggingface,forward,memory,MB,C,num_channels,64,80.12890625,80.12890625,80.12890625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,128,160.25390625,160.25390625,160.25390625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,256,320.5078125,320.5078125,320.5078125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,forward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,32,40.01171875,40.01171875,40.01171875,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,64,80.01953125,80.01953125,80.01953125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,128,160.03515625,160.03515625,160.03515625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,256,320.0703125,320.0703125,320.0703125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,512,640.140625,640.140625,640.140625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,1024,1280.28125,1280.28125,1280.28125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,liger,backward,memory,MB,C,num_channels,2048,2560.5625,2560.5625,2560.5625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,32,40.06640625,40.06640625,40.06640625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,64,80.12890625,80.12890625,80.12890625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 
+group_norm,huggingface,backward,memory,MB,C,num_channels,128,160.25390625,160.25390625,160.25390625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,256,320.5078125,320.5078125,320.5078125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,512,641.015625,641.015625,641.015625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,1024,1282.03125,1282.03125,1282.03125,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 +group_norm,huggingface,backward,memory,MB,C,num_channels,2048,2564.0625,2564.0625,2564.0625,"{""M"": 128, ""H"": 512, ""channels_per_group"": 4, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA B200,2026-02-28 00:23:32,0.7.0 diff --git a/benchmark/scripts/__init__.py b/benchmark/scripts/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/benchmark/scripts/benchmark_cpo_loss.py b/benchmark/scripts/benchmark_cpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..8b10d518880b19644bd7d6c3cc4b9cd64cc8a541 --- /dev/null +++ b/benchmark/scripts/benchmark_cpo_loss.py @@ -0,0 +1,167 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +############################################################################# +# Test the memory consumption of the fused linear CPO loss +############################################################################# + + +def bench_memory_fused_linear_cpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_cpo_loss import LigerLMHeadCPO + from test.chunked_loss.test_cpo_loss import TorchLMHeadCPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + + # Instantiate once and retrieve the first output only + torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0] + liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0] + + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + def fwd(): + if provider == "liger": + return liger_fwd(_input, target) + elif provider == "huggingface": + return torch_fwd(_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + )
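+ + +# NOTE: QUANTILES presumably holds (0.5, 0.2, 0.8), matching the (mem_50, mem_20, mem_80) +# unpacking above; each benchmark therefore reports a median plus 20th/80th percentile +# bounds over the 10 measured iterations.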
+ + +# ############################################################################# +# # Test the speed of the fused linear CPO loss +# ############################################################################# + + +def bench_speed_fused_linear_cpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_cpo_loss import LigerLMHeadCPO + from test.chunked_loss.test_cpo_loss import TorchLMHeadCPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + # Instantiate once and retrieve the first output only + torch_lm_head_cpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + liger_lm_head_cpo = LigerLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + torch_fwd = lambda x, target: torch_lm_head_cpo(x, target)[0] + liger_fwd = lambda x, target: liger_lm_head_cpo(x, target)[0] + + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + def fwd(): + if provider == "liger": + return liger_fwd(_input, target) + elif provider == "huggingface": + return torch_fwd(_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_linear_cpo_loss", + "x_name": "B", + "x_label": "B", + "x_values": [2**i for i in range(1, 5)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "T": 1024, + "H": 4096, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_cpo_loss, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_cpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_cross_entropy.py b/benchmark/scripts/benchmark_cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..cdd61814ac076923a4d75b2eeef0866c8d70f081 --- /dev/null +++ b/benchmark/scripts/benchmark_cross_entropy.py @@ -0,0 +1,126 @@ +import torch +import triton + +from torch.nn import CrossEntropyLoss +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss +from liger_kernel.utils import infer_device + +device = infer_device() + + +def bench_memory_cross_entropy( + input:
SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + torch_ce = CrossEntropyLoss() + liger_ce = LigerCrossEntropyLoss() + + V = input.x + provider = input.kernel_provider + B = input.extra_benchmark_config["B"] + T = input.extra_benchmark_config["T"] + + _input = torch.randn(B * T, V, requires_grad=True, device=device) + target = torch.randint(V, (B * T, 1), device=device).squeeze(1) + + def fwd(): + if provider == "liger": + return liger_ce(_input, target) + else: + return torch_ce(_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +def bench_speed_cross_entropy( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + torch_ce = CrossEntropyLoss() + liger_ce = LigerCrossEntropyLoss() + + V = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + B = input.extra_benchmark_config["B"] + T = input.extra_benchmark_config["T"] + + _input = torch.randn(B * T, V, requires_grad=True, device=device) + target = torch.randint(V, (B * T, 1), device=device).squeeze(1) + + def fwd(): + if provider == "liger": + return liger_ce(_input, target) + else: + return torch_ce(_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES) + elif mode == "no-grad-forward": + with torch.no_grad(): + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES) + elif mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, rep=100, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "cross_entropy", + "x_name": "V", + "x_label": "vocab size", + "x_values": [2**i for i in range(12, 18)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [{"B": 8, "T": 2048}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_cross_entropy, + kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_cross_entropy, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_distill_cosine_loss.py b/benchmark/scripts/benchmark_distill_cosine_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..5cf12b495c835d507c5f11c7f8078b2e27414354 --- /dev/null +++ b/benchmark/scripts/benchmark_distill_cosine_loss.py @@ -0,0 +1,266 @@ +import os +import sys + +import torch +import torch.nn as nn +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction +from liger_kernel.utils import infer_device + +device = infer_device() + +# Ensure the project root is in the path 
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +class TorchCosineSimilarityLoss(nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + bias: bool = False, + ): + from test.chunked_loss.test_cosine_loss import HFCosineLoss + + super().__init__() + self.student_lin = nn.Linear(in_features=H // 2, out_features=V, bias=bias).to(dtype=dtype) + self.teacher_lin = nn.Linear(in_features=H, out_features=V, bias=bias).to(dtype=dtype) + self.cosine_loss = HFCosineLoss( + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + ).get_batch_loss_metrics + + def forward(self, student: torch.Tensor, teacher: torch.Tensor, target: torch.Tensor): + return self.cosine_loss(student, self.student_lin.weight, teacher, self.teacher_lin.weight, target) + + +class LigerCosineSimilarityLoss(nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + bias: bool = False, + ): + super().__init__() + self.student_lin = nn.Linear(in_features=H // 2, out_features=V, bias=bias).to(dtype=dtype) + self.teacher_lin = nn.Linear(in_features=H, out_features=V, bias=bias).to(dtype=dtype) + self.weight_hard_loss = weight_hard_loss + self.weight_soft_loss = weight_soft_loss + self.ignore_index = ignore_index + self.temperature = temperature + self.cosine_loss = LigerFusedLinearCosineSimilarityFunction.apply + + def forward(self, student: torch.Tensor, teacher: torch.Tensor, target: torch.Tensor): + return self.cosine_loss( + student, + self.student_lin.weight, + teacher, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + self.weight_hard_loss, + self.weight_soft_loss, + ) + + +def bench_memory_cosine_similarity_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"] + weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + + torch_cosine_loss = TorchCosineSimilarityLoss( + H=H, + V=V, + dtype=dtype, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + bias=bias, + ).to(device) + liger_cosine_loss = LigerCosineSimilarityLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + + _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype) + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = _tensor.detach().clone().requires_grad_(True) + + teacher_input = torch.rand(BT, H, device=device, dtype=dtype) + + target = torch.randint(0, V, (BT,), device=device, dtype=torch.long) + + def fwd(): + if provider == "liger": + return liger_cosine_loss(student_input1, teacher_input, target) + elif provider == "torch": + return torch_cosine_loss(student_input2, teacher_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, 
_iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +def bench_speed_cosine_similarity_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"] + weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + torch_cosine_loss = TorchCosineSimilarityLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + + liger_cosine_loss = LigerCosineSimilarityLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + + _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype) + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = _tensor.detach().clone().requires_grad_(True) + + teacher_input = torch.rand(BT, H, device=device, dtype=dtype) + + target = torch.randint(0, V, (BT,), device=device, dtype=torch.long) + + def fwd(): + if provider == "liger": + return liger_cosine_loss(student_input1, teacher_input, target) + elif provider == "torch": + return torch_cosine_loss(student_input2, teacher_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[student_input1, student_input2], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "distill_cosine_loss", + "x_name": "BT", + "x_label": "B x T", + "x_values": [2**i for i in range(10, 14)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "H": 4096, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + "bias": False, + "weight_hard_loss": 0.5, + "weight_soft_loss": 0.5, + "ignore_index": -100, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_cosine_similarity_loss, + kernel_operation_modes=["forward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_cosine_similarity_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_distill_jsd_loss.py b/benchmark/scripts/benchmark_distill_jsd_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..324418e17aea8816846a0fd59330828add136fa2 --- /dev/null +++ b/benchmark/scripts/benchmark_distill_jsd_loss.py @@ -0,0 +1,272 @@
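+# The distillation loss benchmarked below mixes a hard cross-entropy term with a soft +# JSD term via weight_hard_loss / weight_soft_loss; the student head projects H // 2 +# features while the teacher head projects H, i.e. the student is modeled at half the +# teacher's hidden size. A commented reference sketch of the soft term, assuming the +# standard beta-interpolated JSD (illustrative only, not used by the benchmark): +# +# def naive_beta_jsd(student_logits, teacher_logits, beta=0.5): +# p = torch.softmax(teacher_logits, dim=-1) +# q = torch.softmax(student_logits, dim=-1) +# m = beta * p + (1 - beta) * q +# log_m = m.clamp_min(1e-20).log() +# kl_pm = (p * (p.clamp_min(1e-20).log() - log_m)).sum(-1) +# kl_qm = (q * (q.clamp_min(1e-20).log() - log_m)).sum(-1) +# return (beta * kl_pm + (1 - beta) * kl_qm).mean() +import os +import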
sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction +from liger_kernel.utils import get_total_gpu_memory +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +class TorchJSDLoss(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + bias: bool = False, + ): + from test.chunked_loss.test_jsd_loss import HFJSDLoss + + super().__init__() + self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.jsd_loss = HFJSDLoss( + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + ).get_batch_loss_metrics + + def forward(self, student, teacher, target): + return self.jsd_loss( + student, + self.student_lin.weight, + teacher, + self.teacher_lin.weight, + target, + ) + + +class LigerJSDLoss(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + bias: bool = False, + ): + super().__init__() + self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.weight_hard_loss = weight_hard_loss + self.weight_soft_loss = weight_soft_loss + self.ignore_index = ignore_index + self.temperature = temperature + self.jsd_loss = LigerFusedLinearJSDFunction.apply + + def forward(self, student, teacher, target): + return self.jsd_loss( + student, + self.student_lin.weight, + teacher, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + self.weight_hard_loss, + self.weight_soft_loss, + ) + + +def bench_memory_jsd_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"] + weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + + torch_jsd_loss = TorchJSDLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + liger_jsd_loss = LigerJSDLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + + _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype) + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = _tensor.detach().clone().requires_grad_(True) + + teacher_input = torch.rand(BT, H, device=device, dtype=dtype) + + target = 
torch.randint(0, V, (BT,), device=device, dtype=torch.long) + + def fwd(): + if provider == "liger": + return liger_jsd_loss(student_input1, teacher_input, target) + elif provider == "torch": + return torch_jsd_loss(student_input2, teacher_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +def bench_speed_jsd_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"] + weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + torch_jsd_loss = TorchJSDLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + liger_jsd_loss = LigerJSDLoss( + H=H, + V=V, + dtype=dtype, + ignore_index=ignore_index, + bias=bias, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ).to(device) + + _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype) + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = _tensor.detach().clone().requires_grad_(True) + + teacher_input = torch.rand(BT, H, device=device, dtype=dtype) + + target = torch.randint(0, V, (BT,), device=device, dtype=torch.long) + + def fwd(): + if provider == "liger": + return liger_jsd_loss(student_input1, teacher_input, target) + elif provider == "torch": + return torch_jsd_loss(student_input2, teacher_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[student_input1, student_input2], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + gpu_memory_gbs = get_total_gpu_memory() + # We know that the full test will require 69GBs for vocab size 2^13 and 39GBs for vocab size 2^12 on torch + if gpu_memory_gbs >= 69: + x_max = 13 + elif gpu_memory_gbs >= 39: + x_max = 12 + else: + x_max = 11 + + common_configs = { + "kernel_name": "distill_jsd_loss", + "x_name": "BT", + "x_label": "B x T", + "x_values": [2**i for i in range(10, x_max + 1)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "H": 4096, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + "bias": False, + "weight_hard_loss": 0.5, + "weight_soft_loss": 0.5, + "ignore_index": -100, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_jsd_loss, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_jsd_loss, + 
kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_dpo_loss.py b/benchmark/scripts/benchmark_dpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..228a228d55753042b0d1bf9471085076e1eefe3b --- /dev/null +++ b/benchmark/scripts/benchmark_dpo_loss.py @@ -0,0 +1,179 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_dpo_loss import LigerLMHeadDPO + from test.chunked_loss.test_dpo_loss import TorchLMHeadDPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + beta = input.extra_benchmark_config["beta"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + + # Instantiate once and retrieve the first output only + torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device) + liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device) + torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0] + liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0] + + # Input shape: [B, T, H] + _input = torch.randn(B, T, H, device=device, dtype=dtype) + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) + # Target shape: [B, T] + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + # Add ignore_index tokens to simulate padding + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + def fwd(): + if provider == "liger": + return liger_fwd(_input, ref_input, target) + elif provider == "huggingface": + return torch_fwd(_input, ref_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_dpo_loss import LigerLMHeadDPO + from test.chunked_loss.test_dpo_loss import TorchLMHeadDPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + beta = input.extra_benchmark_config["beta"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + # Instantiate once and retrieve the first output only + torch_dpo_loss = TorchLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device) + 
liger_dpo_loss = LigerLMHeadDPO(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device) + torch_fwd = lambda x, ref_x, target: torch_dpo_loss(x, ref_x, target)[0] + liger_fwd = lambda x, ref_x, target: liger_dpo_loss(x, ref_x, target)[0] + + # Input shape: [B, T, H] + _input = torch.randn(B, T, H, device=device, dtype=dtype) + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) + # Target shape: [B, T] + target = torch.randint(V, (B, T), device=device, dtype=torch.long) + + # Add ignore_index tokens + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + def fwd(): + if provider == "liger": + return liger_fwd(_input, ref_input, target) + elif provider == "huggingface": + return torch_fwd(_input, ref_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "dpo_loss", + "x_name": "B", + "x_label": "Batch Size (B)", + "x_values": [2**i for i in range(1, 6)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "T": 512, + "H": 1024, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + "bias": True, + "beta": 0.1, + "ignore_index": 42, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_dpo_loss, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_dpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_dyt.py b/benchmark/scripts/benchmark_dyt.py new file mode 100755 index 0000000000000000000000000000000000000000..2c5129000d93001f4c585b58e6b68d143e0685cf --- /dev/null +++ b/benchmark/scripts/benchmark_dyt.py @@ -0,0 +1,96 @@ +import os +import sys + +import torch + +from benchmark_model_configs import compute_hidden_size_sweep_config +from benchmark_model_configs import estimate_kernel_peak_memory +from benchmark_model_configs import get_benchmark_model_config +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import parse_benchmark_script_args +from utils import run_benchmarks +from utils import run_memory_benchmark +from utils import run_speed_benchmark + +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +def _setup_dyt(input: SingleBenchmarkRunInput): + """Create input tensor and DyT layer from benchmark config.""" + from test.transformers.test_dyt import LigerDyT + from test.transformers.test_dyt import TorchDyT + + cfg = input.extra_benchmark_config + hidden_size = input.x + x = torch.randn(cfg["BT"], hidden_size, 
device=device, dtype=cfg["dtype"], requires_grad=True) + if input.kernel_provider == "liger": + layer = LigerDyT(hidden_size=hidden_size, beta=cfg["beta"]).to(device) + elif input.kernel_provider == "torch": + layer = TorchDyT(hidden_size=hidden_size, beta=cfg["beta"]).to(device) + elif input.kernel_provider == "torch_compile": + layer = torch.compile(TorchDyT(hidden_size=hidden_size, beta=cfg["beta"]).to(device)) + else: + raise ValueError(f"Invalid provider: {input.kernel_provider} for DyT") + return x, layer + + +def bench_speed_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = _setup_dyt(input) + return run_speed_benchmark(lambda: layer(x), input.kernel_operation_mode, [x]) + + +def bench_memory_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = _setup_dyt(input) + return run_memory_benchmark(lambda: layer(x), input.kernel_operation_mode) + + +BT = 4096 + +if __name__ == "__main__": + args = parse_benchmark_script_args() + model = get_benchmark_model_config(args.model) + + for beta in [False, True]: + + def _probe(): + probe_input = SingleBenchmarkRunInput( + x=model.hidden_size, + kernel_provider="torch", + extra_benchmark_config={"BT": BT, "dtype": model.dtype, "beta": beta}, + ) + x, layer = _setup_dyt(probe_input) + return layer(x) + + peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe) + sweep_config = compute_hidden_size_sweep_config(model, peak_bytes, bt=BT) + x_values = [1024 * i for i in range(1, 17) if 1024 * i <= sweep_config.max_hidden_size] or [model.hidden_size] + + common_configs = { + "kernel_name": f"dyt_beta={beta}", + "x_name": "hidden_size", + "x_label": "hidden_size", + "x_values": x_values, + "kernel_providers": ["liger", "torch", "torch_compile"], + "extra_benchmark_configs": [{"BT": sweep_config.bt, "dtype": model.dtype, "beta": beta}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_dyt, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_dyt, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_embedding.py b/benchmark/scripts/benchmark_embedding.py new file mode 100755 index 0000000000000000000000000000000000000000..2bd0c60be9735017eb1ab219eddb8b49773d360d --- /dev/null +++ b/benchmark/scripts/benchmark_embedding.py @@ -0,0 +1,134 @@ +import torch +import triton + +from torch.nn import Embedding +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.experimental.embedding import LigerEmbedding +from liger_kernel.utils import infer_device + +device = infer_device() + +# NOTE: For torch compile, we will just use default inductor settings. No further customization +# is needed. 
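+# The x axis sweeps V, the number of embeddings (i.e. the vocab size), from 2**10 to +# 2**17; the embedding dimension D stays fixed per config (768 for the BERT-style +# config, 4096 for the Llama-style one).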
+ + +def bench_speed_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + V = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + B = input.extra_benchmark_config["B"] + T = input.extra_benchmark_config["T"] + D = input.extra_benchmark_config["D"] + dtype = input.extra_benchmark_config["dtype"] + + torch_emb = Embedding(V, D).to(device).to(dtype) + liger_emb = LigerEmbedding(V, D).to(device).to(dtype) + torch_compile_emb = torch.compile(torch_emb) + + input_ids = torch.randint(0, V, (B, T), device=device) + + def fwd(): + if provider == "liger": + return liger_emb(input_ids) + elif provider == "torch_compile": + return torch_compile_emb(input_ids) + else: + return torch_emb(input_ids) + + def full(): + output = fwd() + output.backward(torch.randn_like(output)) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100) + elif mode == "backward": + output = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: output.backward(torch.randn_like(output), retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[input_ids], + rep=100, + ) + elif mode == "full": + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + V = input.x + provider = input.kernel_provider + + B = input.extra_benchmark_config["B"] + T = input.extra_benchmark_config["T"] + D = input.extra_benchmark_config["D"] + dtype = input.extra_benchmark_config["dtype"] + + torch_emb = Embedding(V, D).to(device).to(dtype) + liger_emb = LigerEmbedding(V, D).to(device).to(dtype) + torch_compile_emb = torch.compile(torch_emb) + + input_ids = torch.randint(0, V, (B, T), device=device) + + def fwd(): + if provider == "liger": + return liger_emb(input_ids) + elif provider == "torch_compile": + return torch_compile_emb(input_ids) + else: + return torch_emb(input_ids) + + def full(): + output = fwd() + output.backward(torch.randn_like(output)) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "embedding", + "x_name": "V", + "x_label": "vocab size", + "x_values": [2**i for i in range(10, 18)], + "kernel_providers": ["liger", "huggingface", "torch_compile"], + "extra_benchmark_configs": [ + # BERT + {"B": 32, "T": 512, "D": 768, "dtype": torch.float32}, + # Llama + {"B": 8, "T": 2048, "D": 4096, "dtype": torch.float32}, + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_embedding, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_embedding, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_fused_add_rms_norm.py b/benchmark/scripts/benchmark_fused_add_rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..935871e9065a0a65c555ba7097a9be30866565f3 --- /dev/null +++ b/benchmark/scripts/benchmark_fused_add_rms_norm.py @@ -0,0 +1,201 @@
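+# Fused add + RMS norm folds the residual addition into the normalization kernel: +# h = x + r; y = weight * h / sqrt(mean(h**2) + eps); both y and the updated residual +# h are returned so the residual stream is refreshed in the same pass (see +# NaiveAddRMSNorm below for the reference math). +import torch +import torch.nn as nn +import triton + +from utils import QUANTILES +from utils import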
SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.fused_add_rms_norm import LigerFusedAddRMSNorm +from liger_kernel.transformers.rms_norm import LigerRMSNorm +from liger_kernel.utils import infer_device + +device = infer_device() + + +class NaiveAddRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Naive implementation of the add residual rms norm. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, residual): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + residual = residual.to(torch.float32) + hidden_states = hidden_states + residual + residual = hidden_states + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype), residual.to(input_dtype) + + +class AddLigerRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + AddLigerRMSNorm is equivalent to NaiveAddRMSNorm class above, but uses the LigerRMSNorm kernel. + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.rms_norm = LigerRMSNorm(hidden_size, eps, in_place=False) + + def forward(self, hidden_states, residual): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + residual = residual.to(torch.float32) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.rms_norm(hidden_states) + return self.weight * hidden_states.to(input_dtype), residual.to(input_dtype) + + +def bench_speed_fused_residual_rms_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + # Fused Add RMS Norm + fused_add_rms_norm = LigerFusedAddRMSNorm(hidden_size=N, eps=eps).to(device) + # Naive implementation + naive_rms_norm = NaiveAddRMSNorm(hidden_size=N, eps=eps).to(device) + # LigerRMSNorm without fused residual addition + liger_rms_norm = AddLigerRMSNorm(hidden_size=N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + r = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + ds = torch.randn_like(r) + x.requires_grad_(True) + r.requires_grad_(True) + # utility functions + + def y_fwd(): + if provider == "liger_fused_add_rms_norm": + return fused_add_rms_norm(x, r) + + if provider == "huggingface": + return naive_rms_norm(x, r) + + if provider == "liger_rms_norm": + return liger_rms_norm(x, r) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + y_fwd, + grad_to_none=[x, r], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "backward": + y, s = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: torch.autograd.backward((y, s), (dy, ds), retain_graph=True), + grad_to_none=[x, r], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y, s = y_fwd() + torch.autograd.backward((y, s), (dy, ds)) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + 
grad_to_none=[x, r], + rep=500, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_fused_residual_rms_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + fused_add_rms_norm = LigerFusedAddRMSNorm(hidden_size=N, eps=eps).to(device) + naive_rms_norm = NaiveAddRMSNorm(hidden_size=N, eps=eps).to(device) + liger_rms_norm = AddLigerRMSNorm(hidden_size=N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + r = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + ds = torch.randn_like(r) + x.requires_grad_(True) + r.requires_grad_(True) + + # utility functions + def y_fwd(): + if provider == "liger_fused_add_rms_norm": + return fused_add_rms_norm(x, r) + if provider == "huggingface": + return naive_rms_norm(x, r) + if provider == "liger_rms_norm": + return liger_rms_norm(x, r) + + def full(): + y, s = y_fwd() + torch.autograd.backward((y, s), (dy, ds)) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_add_rms_norm", + "x_name": "H", + "x_label": "hidden size", + "x_values": [2**i for i in range(10, 16)], + "kernel_providers": ["liger_fused_add_rms_norm", "huggingface", "liger_rms_norm"], + "extra_benchmark_configs": [{"M": 2048, "dtype": torch.float32, "eps": 1e-6}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_residual_rms_norm, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_residual_rms_norm, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_fused_linear_cross_entropy.py b/benchmark/scripts/benchmark_fused_linear_cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..4d36a66a6394ec7f9104d0300dd599acf332c29b --- /dev/null +++ b/benchmark/scripts/benchmark_fused_linear_cross_entropy.py @@ -0,0 +1,184 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchLMHeadCE(torch.nn.Module): + """Ground truth implementation of the linear fused with torch based cross entropy loss. 
+ + :param H: hidden size + :param V: vocab size + :param ignore_index: index to ignore + :param reduction: reduction method + """ + + def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype) + self.ce_loss = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction="mean") + + def forward(self, x, y): + logits = self.lin(x) + return self.ce_loss(logits, y) + + +class LigerLMHeadCE(torch.nn.Module): + def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100, accum_dtype=None): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype) + self.ce_loss = LigerFusedLinearCrossEntropyLoss( + ignore_index=ignore_index, reduction="mean", accum_dtype=accum_dtype + ) + + def forward(self, x, y): + return self.ce_loss(self.lin.weight, x, y) + + +############################################################################# +# Test the memory consumption of the linear fused cross entropy loss +############################################################################# + + +def bench_memory_fused_linear_cross_entropy( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + + lm_head_ce = None + if provider == "liger": + lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device) + elif provider == "liger-fp32-accum": + lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device) + else: + lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device) + + _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1) + + def fwd(): + return lm_head_ce(_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +# ############################################################################# +# # Test the speed of the fused linear cross entropy loss +# ############################################################################# + + +def bench_speed_fused_linear_cross_entropy( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + lm_head_ce = None + if provider == "liger": + lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device) + elif provider == "liger-fp32-accum": + lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device) + else: + lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device) + + _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1) + + def fwd(): + return lm_head_ce(_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "no-grad-forward": + with torch.no_grad(): + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) 
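The reason a fused linear + cross-entropy loss is benchmarked for memory at all is that the unfused path must materialize the full `BT x V` logits matrix before the loss. Back-of-the-envelope arithmetic for the configuration used below (H=4096, V=128256, bf16):

```python
# Rough logits-memory arithmetic for the config benchmarked below. Illustrative only.
BT = 2**15          # batch * sequence length, the largest x_value below
V = 128256          # vocab size
bytes_per_el = 2    # bfloat16

logits_gib = BT * V * bytes_per_el / 1024**3
print(f"logits alone: {logits_gib:.1f} GiB")   # ~7.8 GiB, before the matching gradient buffer
# The fused/chunked loss processes slices of BT at a time, so peak memory scales
# with the chunk size rather than with the full BT x V matrix.
```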
+ elif mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_linear_cross_entropy", + "x_name": "BT", + "x_label": "B x T", + "x_values": [2**i for i in range(12, 16)], + "kernel_providers": ["liger", "liger-fp32-accum", "huggingface"], + "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_cross_entropy, + kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_cross_entropy, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_fused_linear_jsd.py b/benchmark/scripts/benchmark_fused_linear_jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..ac62863b216a94f4fd9f9970145e394bbe20ebd8 --- /dev/null +++ b/benchmark/scripts/benchmark_fused_linear_jsd.py @@ -0,0 +1,260 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchJSD(torch.nn.Module): + def __init__( + self, + beta: float = 0.5, + ignore_index: int = -100, + dtype: torch.dtype = torch.float, + ): + super(TorchJSD, self).__init__() + self.kl = torch.nn.KLDivLoss(reduction="none", log_target=True) + self.beta = beta + self.ignore_index = ignore_index + self.dtype = dtype + + def forward( + self, + log_q: torch.Tensor, # input + log_p: torch.Tensor, # target + label=None, + ): + log_p, log_q = log_p.to(torch.float), log_q.to(torch.float) + log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1)) + m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta) + loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (1 - self.beta) * self.kl( + torch.log(m), log_q + ).sum(dim=-1) + + if label is not None: + loss = torch.where(label != self.ignore_index, loss, 0.0) + n_non_ignore = (label != self.ignore_index).sum().item() + if n_non_ignore == 0: + loss = 0.0 + else: + loss = (loss / n_non_ignore).sum() + else: + loss = (loss / log_q.shape[0]).sum() + return loss.to(self.dtype) + + +class TorchLMHeadJSD(torch.nn.Module): + """Ground truth implementation of the linear fused with torch based jsd loss. 
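`TorchJSD` above implements the generalized Jensen-Shannon divergence: with mixture `M = beta * P + (1 - beta) * Q`, the loss is `beta * KL(P || M) + (1 - beta) * KL(Q || M)`. A small numeric sketch of that identity written directly against `torch.nn.functional`:

```python
import torch
import torch.nn.functional as F

beta = 0.5
p = torch.softmax(torch.randn(4, 10), dim=-1)   # target distribution
q = torch.softmax(torch.randn(4, 10), dim=-1)   # student distribution
m = beta * p + (1 - beta) * q                   # mixture

# F.kl_div(input=log m, target=p) gives pointwise p * (log p - log m)
kl_pm = F.kl_div(m.log(), p, reduction="none").sum(-1)
kl_qm = F.kl_div(m.log(), q, reduction="none").sum(-1)
jsd = (beta * kl_pm + (1 - beta) * kl_qm).mean()
print(jsd)  # bounded by log(2) when beta = 0.5
```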
+ + :param H: hidden size + :param V: vocab size + :param temperature: softmax temperature + :param beta: jsd beta + """ + + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + device: torch.device, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__() + self.student_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device) + self.jsd = TorchJSD(beta=beta, ignore_index=ignore_index, dtype=dtype) + self.temperature = temperature + + def forward(self, student_input, teacher_input, label=None): + student_logits = self.student_lin(student_input) + teacher_logits = self.teacher_lin(teacher_input) + student_prob = torch.log_softmax(student_logits / self.temperature, dim=-1) + teacher_prob = torch.log_softmax(teacher_logits / self.temperature, dim=-1) + + return self.jsd(student_prob, teacher_prob, label) + + +class LigerLMHeadJSD(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + device: torch.device, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__() + self.student_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device) + self.fused_jsd = LigerFusedLinearJSD(jsd_beta=beta, ignore_index=ignore_index, temperature=temperature) + + def forward(self, student_input, teacher_input, label=None): + return self.fused_jsd( + student_input, + self.student_lin.weight, + teacher_input, + self.teacher_lin.weight, + label, + ) + + +############################################################################# +# Test the memory consumption of the fused linear JSD +############################################################################# + + +def bench_memory_fused_linear_jsd( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + + torch_lm_head_jsd = TorchLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device) + liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device) + + # init the linear in all FusedLinearJSDs with the same weights + torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + torch_lm_head_jsd.teacher_lin.weight.data = liger_lm_head_jsd.teacher_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + + student_input = torch.rand(BT, H, requires_grad=True, dtype=dtype, device=device) + teacher_input = torch.rand(BT, H, dtype=dtype, device=device) + + def fwd(): + if provider == "liger": + return liger_lm_head_jsd(student_input, teacher_input) + elif provider == "torch": + return torch_lm_head_jsd(student_input, teacher_input) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +# ############################################################################# +# # Test the speed of the fused linear JSD +# ############################################################################# + + +def 
bench_speed_fused_linear_jsd( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + BT = input.x + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + mode = input.kernel_operation_mode + + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + + torch_lm_head_jsd = TorchLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device) + liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device) + + # init the linear in all FusedLinearJSDs with the same weights + torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + torch_lm_head_jsd.teacher_lin.weight.data = liger_lm_head_jsd.teacher_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + + student_input = torch.rand(BT, H, requires_grad=True, dtype=dtype, device=device) + teacher_input = torch.rand(BT, H, dtype=dtype, device=device) + + def fwd(): + if provider == "liger": + return liger_lm_head_jsd(student_input, teacher_input) + elif provider == "torch": + return torch_lm_head_jsd(student_input, teacher_input) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[ + student_input, + torch_lm_head_jsd.student_lin.weight, + torch_lm_head_jsd.teacher_lin.weight, + ], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_linear_jsd", + "x_name": "BT", + "x_label": "B x T", + "x_values": [2**i for i in range(10, 14)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_jsd, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_jsd, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_fused_neighborhood_attention.py b/benchmark/scripts/benchmark_fused_neighborhood_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..515d65cad090d098db93624d99e206a73d602330 --- /dev/null +++ b/benchmark/scripts/benchmark_fused_neighborhood_attention.py @@ -0,0 +1,367 @@ +import math + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.fused_neighborhood_attention import LigerFusedNeighborhoodAttention +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchNeighborhoodAttention(torch.nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + kernel_size: int = 7, + dilation: int = 1, + bias: bool = True, + 
dropout: float = 0.0, + scale: float = None, + ): + super().__init__() + + if hidden_size % num_heads != 0: + raise ValueError(f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})") + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.kernel_size = kernel_size + self.dilation = dilation + self.scale = scale if scale is not None else 1.0 / math.sqrt(self.head_dim) + + self.q_proj = torch.nn.Linear(hidden_size, hidden_size, bias=bias) + self.k_proj = torch.nn.Linear(hidden_size, hidden_size, bias=bias) + self.v_proj = torch.nn.Linear(hidden_size, hidden_size, bias=bias) + self.out_proj = torch.nn.Linear(hidden_size, hidden_size, bias=bias) + + if dropout > 0.0: + self.dropout = torch.nn.Dropout(dropout) + else: + self.dropout = None + + def _create_neighborhood_mask(self, seq_len: int, device: torch.device) -> torch.Tensor: + mask = torch.zeros(seq_len, seq_len, device=device, dtype=torch.bool) + half_kernel = self.kernel_size // 2 + + for i in range(seq_len): + start = max(0, i - half_kernel * self.dilation) + end = min(seq_len, i + half_kernel * self.dilation + 1) + + for j in range(start, end): + if self.dilation == 1 or (j - i) % self.dilation == 0: + mask[i, j] = True + + return mask + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, hidden_size = hidden_states.shape + + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + key = key.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale + + mask = self._create_neighborhood_mask(seq_len, hidden_states.device) + scores = scores.masked_fill(~mask, float("-inf")) + + attn_weights = torch.softmax(scores, dim=-1) + + if self.dropout is not None: + attn_weights = self.dropout(attn_weights) + + attn_output = torch.matmul(attn_weights, value) + + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_size) + + output = self.out_proj(attn_output) + + return output + + +def bench_speed_fused_neighborhood_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + seq_len = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + batch_size = extra_benchmark_config["batch_size"] + hidden_size = extra_benchmark_config["hidden_size"] + num_heads = extra_benchmark_config["num_heads"] + kernel_size = extra_benchmark_config["kernel_size"] + dilation = extra_benchmark_config["dilation"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (batch_size, seq_len, hidden_size) + + liger_attn = ( + LigerFusedNeighborhoodAttention( + hidden_size=hidden_size, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + dropout=0.0, + ) + .to(device) + .to(dtype) + ) + + torch_attn = ( + TorchNeighborhoodAttention( + hidden_size=hidden_size, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + dropout=0.0, + ) + .to(device) + .to(dtype) + ) + + with torch.no_grad(): + torch_attn.q_proj.weight.copy_(liger_attn.q_proj.weight) + torch_attn.k_proj.weight.copy_(liger_attn.k_proj.weight) + 
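`_create_neighborhood_mask` above restricts each query position to a window of `kernel_size` keys (optionally dilated), which is what makes neighborhood attention cheaper than full attention. A small sketch that prints the mask so the window structure is visible:

```python
import torch

def neighborhood_mask(seq_len, kernel_size, dilation=1):
    """Boolean (seq_len, seq_len) mask: True where query i may attend key j."""
    mask = torch.zeros(seq_len, seq_len, dtype=torch.bool)
    half = kernel_size // 2
    for i in range(seq_len):
        for j in range(max(0, i - half * dilation), min(seq_len, i + half * dilation + 1)):
            if dilation == 1 or (j - i) % dilation == 0:
                mask[i, j] = True
    return mask

print(neighborhood_mask(8, kernel_size=3).int())
# Each row has at most kernel_size ones centered on the diagonal:
# O(T * k) attended pairs instead of O(T^2) for full attention.
```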
torch_attn.v_proj.weight.copy_(liger_attn.v_proj.weight) + torch_attn.out_proj.weight.copy_(liger_attn.out_proj.weight) + + if bias: + torch_attn.q_proj.bias.copy_(liger_attn.q_proj.bias) + torch_attn.k_proj.bias.copy_(liger_attn.k_proj.bias) + torch_attn.v_proj.bias.copy_(liger_attn.v_proj.bias) + torch_attn.out_proj.bias.copy_(liger_attn.out_proj.bias) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return liger_attn(x) + elif provider == "torch": + return torch_attn(x) + + print(f"Starting Warmup for input size: {x_shape}") + _ = fwd() + if mode in ("backward", "full"): + y = _ + y.backward(dy, retain_graph=True) + print("Done Warmup") + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, grad_to_none=[x], rep=100, quantiles=QUANTILES) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, grad_to_none=[x], rep=100, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_fused_neighborhood_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + seq_len = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + batch_size = extra_benchmark_config["batch_size"] + hidden_size = extra_benchmark_config["hidden_size"] + num_heads = extra_benchmark_config["num_heads"] + kernel_size = extra_benchmark_config["kernel_size"] + dilation = extra_benchmark_config["dilation"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (batch_size, seq_len, hidden_size) + + liger_attn = ( + LigerFusedNeighborhoodAttention( + hidden_size=hidden_size, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + dropout=0.0, + ) + .to(device) + .to(dtype) + ) + + torch_attn = ( + TorchNeighborhoodAttention( + hidden_size=hidden_size, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + dropout=0.0, + ) + .to(device) + .to(dtype) + ) + + with torch.no_grad(): + torch_attn.q_proj.weight.copy_(liger_attn.q_proj.weight) + torch_attn.k_proj.weight.copy_(liger_attn.k_proj.weight) + torch_attn.v_proj.weight.copy_(liger_attn.v_proj.weight) + torch_attn.out_proj.weight.copy_(liger_attn.out_proj.weight) + + if bias: + torch_attn.q_proj.bias.copy_(liger_attn.q_proj.bias) + torch_attn.k_proj.bias.copy_(liger_attn.k_proj.bias) + torch_attn.v_proj.bias.copy_(liger_attn.v_proj.bias) + torch_attn.out_proj.bias.copy_(liger_attn.out_proj.bias) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return liger_attn(x) + elif provider == "torch": + return torch_attn(x) + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_neighborhood_attention", + "x_name": "seq_len", + "x_label": "sequence length", + "x_values": [2**i for i in 
range(6, 13)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "batch_size": 2, + "hidden_size": 512, + "num_heads": 8, + "kernel_size": 7, + "dilation": 1, + "bias": True, + "dtype": torch.float32, + }, + { + "batch_size": 4, + "hidden_size": 768, + "num_heads": 12, + "kernel_size": 7, + "dilation": 1, + "bias": True, + "dtype": torch.float32, + }, + { + "batch_size": 2, + "hidden_size": 1024, + "num_heads": 16, + "kernel_size": 9, + "dilation": 1, + "bias": True, + "dtype": torch.float32, + }, + { + "batch_size": 2, + "hidden_size": 512, + "num_heads": 8, + "kernel_size": 7, + "dilation": 2, + "bias": True, + "dtype": torch.float32, + }, + { + "batch_size": 2, + "hidden_size": 512, + "num_heads": 8, + "kernel_size": 7, + "dilation": 1, + "bias": True, + "dtype": torch.bfloat16, + }, + { + "batch_size": 4, + "hidden_size": 768, + "num_heads": 12, + "kernel_size": 7, + "dilation": 1, + "bias": True, + "dtype": torch.bfloat16, + }, + { + "batch_size": 2, + "hidden_size": 1024, + "num_heads": 16, + "kernel_size": 9, + "dilation": 1, + "bias": True, + "dtype": torch.bfloat16, + }, + { + "batch_size": 2, + "hidden_size": 512, + "num_heads": 8, + "kernel_size": 7, + "dilation": 2, + "bias": True, + "dtype": torch.bfloat16, + }, + ], + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_neighborhood_attention, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_fused_neighborhood_attention, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_geglu.py b/benchmark/scripts/benchmark_geglu.py new file mode 100755 index 0000000000000000000000000000000000000000..d59564bafa15b22cddb2d5cf7b6a64f01d5fa989 --- /dev/null +++ b/benchmark/scripts/benchmark_geglu.py @@ -0,0 +1,115 @@ +import math + +import torch + +from benchmark_model_configs import compute_seq_len_sweep_config +from benchmark_model_configs import estimate_kernel_peak_memory +from benchmark_model_configs import get_benchmark_model_config +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaMLP +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import parse_benchmark_script_args +from utils import run_benchmarks +from utils import run_memory_benchmark +from utils import run_speed_benchmark + +from liger_kernel.transformers.geglu import LigerGEGLUMLP +from liger_kernel.utils import infer_device + +device = infer_device() + + +def _setup_geglu(input: SingleBenchmarkRunInput): + """Create input tensor and GEGLU layer from benchmark config.""" + cfg = input.extra_benchmark_config + llama_config = LlamaConfig( + hidden_size=cfg["hidden_size"], + intermediate_size=cfg["intermediate_size"], + hidden_act=cfg["hidden_act"], + ) + x = torch.randn( + cfg["bsz"], + input.x, + cfg["hidden_size"], + device=device, + dtype=cfg["dtype"], + requires_grad=True, + ) + if input.kernel_provider == "liger": + layer = LigerGEGLUMLP(config=llama_config).to(device).to(cfg["dtype"]) + elif input.kernel_provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(cfg["dtype"]) + else: + raise ValueError(f"Invalid provider: {input.kernel_provider} for GEGLU") + return x, layer + + +def bench_speed_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = 
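Both providers in this GEGLU script compute the same feed-forward, `down_proj(GELU(gate_proj(x)) * up_proj(x))`, with the tanh-approximated GELU selected by `hidden_act="gelu_pytorch_tanh"`. A plain-PyTorch sketch of that computation with toy, hypothetical sizes:

```python
import torch
import torch.nn.functional as F

def geglu_mlp(x, w_gate, w_up, w_down):
    """GEGLU feed-forward: down(gelu_tanh(x @ Wg^T) * (x @ Wu^T))."""
    gate = F.gelu(x @ w_gate.T, approximate="tanh")
    return (gate * (x @ w_up.T)) @ w_down.T

H, I = 64, 256                      # toy hidden / intermediate sizes (hypothetical)
x = torch.randn(2, 16, H)
w_gate, w_up = torch.randn(I, H), torch.randn(I, H)
w_down = torch.randn(H, I)
print(geglu_mlp(x, w_gate, w_up, w_down).shape)  # torch.Size([2, 16, 64])
```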
_setup_geglu(input) + return run_speed_benchmark(lambda: layer(x), input.kernel_operation_mode, [x]) + + +def bench_memory_geglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = _setup_geglu(input) + return run_memory_benchmark(lambda: layer(x), input.kernel_operation_mode) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + model = get_benchmark_model_config(args.model) + probe_seq_len = 1024 + + def _probe(): + probe_input = SingleBenchmarkRunInput( + x=probe_seq_len, + kernel_provider="huggingface", + extra_benchmark_config={ + "bsz": 1, + "hidden_size": model.hidden_size, + "intermediate_size": model.intermediate_size, + "hidden_act": "gelu_pytorch_tanh", + "dtype": model.dtype, + }, + ) + x, layer = _setup_geglu(probe_input) + return layer(x) + + peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe) + kernel_bpt = peak_bytes // probe_seq_len + + config = compute_seq_len_sweep_config(model, kernel_bytes_per_token=kernel_bpt) + + common_configs = { + "kernel_name": "geglu", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, int(math.log2(config.seq_len)) + 1)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "bsz": config.batch_size, + "hidden_size": model.hidden_size, + "intermediate_size": model.intermediate_size, + "hidden_act": "gelu_pytorch_tanh", + "dtype": model.dtype, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_geglu, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_geglu, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_group_norm.py b/benchmark/scripts/benchmark_group_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..5a8bf37f4af06260409085501c6a0329ae5bfdc4 --- /dev/null +++ b/benchmark/scripts/benchmark_group_norm.py @@ -0,0 +1,137 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.group_norm import LigerGroupNorm +from liger_kernel.utils import infer_device + +device = infer_device() + + +def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + C = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + H = extra_benchmark_config["H"] + channels_per_group = extra_benchmark_config["channels_per_group"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, C, H) + triton_ln = LigerGroupNorm(num_channels=C, num_groups=C // channels_per_group, eps=eps).to(device) + torch_ln = torch.nn.GroupNorm(num_groups=C // channels_per_group, num_channels=C, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def y_fwd(): + if provider == "liger": + return triton_ln(x) + if provider == "huggingface": + return torch_ln(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(y_fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500) 
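The group-norm sweep here fixes `channels_per_group`, so `num_groups = C // channels_per_group` and each group is normalized over `channels_per_group * H` elements per sample. A short sketch of that convention with `torch.nn.GroupNorm`:

```python
import torch

M, C, H = 128, 32, 512
channels_per_group = 4
gn = torch.nn.GroupNorm(num_groups=C // channels_per_group, num_channels=C, eps=1e-6)

x = torch.randn(M, C, H)
y = gn(x)
# With default affine init (weight=1, bias=0), each contiguous group of 4
# channels in a sample is normalized over 4 * H elements:
group = y[0, :channels_per_group].reshape(-1)
print(group.mean().item(), group.var(unbiased=False).item())  # ~0.0, ~1.0
```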
+ elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[x], + rep=500, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=[x], rep=500) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + C = input.x + provider = input.kernel_provider + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + H = extra_benchmark_config["H"] + channels_per_group = extra_benchmark_config["channels_per_group"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, C, H) + triton_ln = LigerGroupNorm(num_channels=C, num_groups=C // channels_per_group, eps=eps).to(device) + torch_ln = torch.nn.GroupNorm(num_groups=C // channels_per_group, num_channels=C, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def y_fwd(): + if provider == "liger": + return triton_ln(x) + if provider == "huggingface": + return torch_ln(x) + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "group_norm", + "x_name": "C", + "x_label": "num_channels", + "x_values": [2**i for i in range(5, 12)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "M": 128, + "H": 512, + "channels_per_group": 4, + "dtype": torch.float32, + "eps": 1e-6, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_group_norm, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_group_norm, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_grpo_loss.py b/benchmark/scripts/benchmark_grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..497d8692c7ab5688637dcfe9c8c9ec955d8cb5e7 --- /dev/null +++ b/benchmark/scripts/benchmark_grpo_loss.py @@ -0,0 +1,234 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +############################################################################# +# Test the memory consumption of the linear fused GRPO loss +############################################################################# + + +def bench_memory_fused_linear_grpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO + from test.chunked_loss.test_grpo_loss import 
TorchLMHeadGRPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"] + provider = input.kernel_provider + + # Instantiate once and retrieve the first output only + torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to( + device + ) + liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to( + device + ) + + # Create inputs + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device) + attention_mask = torch.ones(B, T, device=device) + advantages = torch.randn(B, dtype=dtype, device=device) + ref_input = torch.randn(B, T, H, dtype=dtype, device=device) + + torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[ + 0 + ] + liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[ + 0 + ] + + def fwd(): + if provider == "liger": + return liger_fwd() + elif provider == "torch": + return torch_fwd() + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +############################################################################# +# Test the speed of the fused linear GRPO loss +############################################################################# + + +def bench_speed_fused_linear_grpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_grpo_loss import LigerLMHeadGRPO + from test.chunked_loss.test_grpo_loss import TorchLMHeadGRPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + importance_sampling_level = input.extra_benchmark_config["importance_sampling_level"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + # Instantiate once and retrieve the first output only + torch_lm_head_grpo = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to( + device + ) + liger_lm_head_grpo = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, importance_sampling_level=importance_sampling_level).to( + device + ) + + # Create inputs + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + selected_token_ids = torch.randint(0, V, (B, T), dtype=torch.long, device=device) + attention_mask = torch.ones(B, T, device=device) + advantages = torch.randn(B, dtype=dtype, device=device) + ref_input = torch.randn(B, T, H, dtype=dtype, device=device) + + torch_fwd = lambda: torch_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[ + 0 + ] + liger_fwd = lambda: liger_lm_head_grpo(_input, selected_token_ids, attention_mask, advantages, ref_input=ref_input)[ + 0 + ] + + def fwd(): + if provider == "liger": + return liger_fwd() + elif provider == "torch": + return torch_fwd() + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == 
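The two configs at the end of this script differ only in `importance_sampling_level`: "token" keeps one policy-to-old-policy ratio per token (original GRPO), while "sequence" averages the per-token log-ratios over the valid tokens before exponentiating (the GSPO variant). A toy sketch of the two ratio shapes, using hypothetical log-probabilities:

```python
import torch

B, T = 2, 5
logp_new = torch.randn(B, T)   # hypothetical per-token log-probs under the new policy
logp_old = torch.randn(B, T)   # ... under the old policy
mask = torch.ones(B, T)        # attention mask (1 = real token)

log_ratio = logp_new - logp_old

# "token": one importance ratio per token (GRPO)
token_ratio = log_ratio.exp()                           # shape (B, T)

# "sequence": mean log-ratio per sequence, then exponentiate (GSPO)
seq_log_ratio = (log_ratio * mask).sum(-1) / mask.sum(-1)
seq_ratio = seq_log_ratio.exp()                         # shape (B,)
print(token_ratio.shape, seq_ratio.shape)
```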
"backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + # Benchmark token-level importance sampling (original GRPO) + token_configs = { + "kernel_name": "fused_linear_grpo_loss_token", + "x_name": "B", + "x_label": "B", + "x_values": [2**i for i in range(1, 5)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "T": 1024, + "H": 4096, + "V": 128256, + "importance_sampling_level": "token", + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + # Benchmark sequence-level importance sampling (GSPO) + sequence_configs = { + "kernel_name": "fused_linear_grpo_loss_sequence", + "x_name": "B", + "x_label": "B", + "x_values": [2**i for i in range(1, 5)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "T": 1024, + "H": 4096, + "V": 128256, + "importance_sampling_level": "sequence", + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + # Run benchmarks for token-level (GRPO) + print("Benchmarking GRPO (token-level importance sampling)...") + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_grpo_loss, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **token_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_grpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **token_configs, + ) + + # Run benchmarks for sequence-level (GSPO) + print("Benchmarking GSPO (sequence-level importance sampling)...") + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_grpo_loss, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **sequence_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_grpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **sequence_configs, + ) diff --git a/benchmark/scripts/benchmark_jsd.py b/benchmark/scripts/benchmark_jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..16d71eac042f7736989d5e93e2279360cd0f68dd --- /dev/null +++ b/benchmark/scripts/benchmark_jsd.py @@ -0,0 +1,157 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.jsd import LigerJSD +from liger_kernel.utils import get_total_gpu_memory +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchJSD(torch.nn.Module): + def __init__( + self, + beta: float = 0.5, + ignore_index: int = -100, + dtype: torch.dtype = torch.float, + ): + super(TorchJSD, self).__init__() + self.kl = torch.nn.KLDivLoss(reduction="none", log_target=True) + self.beta = beta + self.ignore_index = ignore_index + self.dtype = dtype + + def forward( + self, + log_q: torch.Tensor, # input + log_p: torch.Tensor, # target + label=None, + ): + log_p, log_q = log_p.to(torch.float), log_q.to(torch.float) + 
log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1)) + m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta) + loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (1 - self.beta) * self.kl( + torch.log(m), log_q + ).sum(dim=-1) + + if label is not None: + loss = torch.where(label != self.ignore_index, loss, 0.0) + n_non_ignore = (label != self.ignore_index).sum().item() + if n_non_ignore == 0: + loss = 0.0 + else: + loss = (loss / n_non_ignore).sum() + else: + loss = (loss / log_q.shape[0]).sum() + return loss.to(self.dtype) + + +def bench_speed_jsd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + torch_jsd = TorchJSD() + liger_jsd = LigerJSD() + + _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(dim=-1) + target = torch.randn(B * T, V, device=device).log_softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_jsd(_input, target) + else: + return torch_jsd(_input, target) + + if input.kernel_operation_mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100) + elif input.kernel_operation_mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[_input], + rep=100, + ) + elif input.kernel_operation_mode == "full": + + def full(): + y = fwd() + y.backward(retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_jsd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + torch_jsd = TorchJSD() + liger_jsd = LigerJSD() + + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + + _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(dim=-1) + target = torch.randn(B * T, V, device=device).log_softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_jsd(_input, target) + else: + return torch_jsd(_input, target) + + def full(): + y = fwd() + y.backward(retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + gpu_memory_gbs = get_total_gpu_memory() + # We know that the full test will require 54GBs for vocab size 2^17 on torch + if gpu_memory_gbs >= 54: + x_max = 17 + else: + x_max = 16 + common_args = { + "kernel_name": "jsd", + "x_name": "V", + "x_label": "vocab size", + "x_values": [2**i for i in range(12, x_max + 1)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [{"B": 4, "T": 2048}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_memory_jsd, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_args, + ) + + run_benchmarks( + bench_test_fn=bench_speed_jsd, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_args, + ) diff --git a/benchmark/scripts/benchmark_kl_div.py b/benchmark/scripts/benchmark_kl_div.py new file mode 100755 index 0000000000000000000000000000000000000000..09948c38b48e0d4b9c549ea374064af833dab0da --- /dev/null +++ 
b/benchmark/scripts/benchmark_kl_div.py @@ -0,0 +1,117 @@ +import torch +import torch.nn as nn +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.kl_div import LigerKLDIVLoss +from liger_kernel.utils import infer_device + +device = infer_device() + +S, E = 12, 18 + + +def bench_speed_kldiv(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + reduction = "batchmean" + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + torch_kl_div = nn.KLDivLoss(reduction=reduction) + liger_kl_div = LigerKLDIVLoss(reduction=reduction) + + _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(dim=-1) + target = torch.randn(B * T, V, device=device).softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_kl_div(_input, target) + else: + return torch_kl_div(_input, target) + + if input.kernel_operation_mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100) + elif input.kernel_operation_mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[_input], + rep=100, + ) + elif input.kernel_operation_mode == "full": + + def full(): + y = fwd() + y.backward(retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_kldiv(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + reduction = "batchmean" + torch_kl_div = nn.KLDivLoss(reduction=reduction) + liger_kl_div = LigerKLDIVLoss(reduction=reduction) + + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + + _input = torch.randn(B * T, V, requires_grad=True, device=device).log_softmax(dim=-1) + target = torch.randn(B * T, V, device=device).softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_kl_div(_input, target) + else: + return torch_kl_div(_input, target) + + def full(): + y = fwd() + y.backward(retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + common_args = { + "kernel_name": "kl_div", + "x_name": "V", + "x_label": "vocab size", + "x_values": [2**i for i in range(12, 18)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [{"B": 8, "T": 512}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_memory_kldiv, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_args, + ) + + run_benchmarks( + bench_test_fn=bench_speed_kldiv, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_args, + ) diff --git a/benchmark/scripts/benchmark_kto_loss.py b/benchmark/scripts/benchmark_kto_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..bbde1d5c6b5749e4ce8067231ee3c7a15fb47e7e --- /dev/null +++ b/benchmark/scripts/benchmark_kto_loss.py @@ -0,0 +1,314 @@ +import os +import sys + +import torch +import triton + +from utils 
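The KL benchmark above follows the `nn.KLDivLoss` conventions: the input is log-probabilities, the target is probabilities (`log_target=False`), and "batchmean" divides the summed divergence by the batch dimension. A minimal sketch of exactly that contract:

```python
import torch
import torch.nn as nn

kl = nn.KLDivLoss(reduction="batchmean")       # expects log-probs input, probs target
x = torch.randn(8, 100).log_softmax(dim=-1)    # log q
t = torch.randn(8, 100).softmax(dim=-1)        # p

loss = kl(x, t)                                # sum(p * (log p - log q)) / batch size
manual = (t * (t.log() - x)).sum() / x.size(0)
print(torch.allclose(loss, manual))            # True
```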
import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss +from liger_kernel.utils import infer_device + +device = infer_device() +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +class TorchLMHeadKTO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + use_bias: bool = False, + use_ref_bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + from test.chunked_loss.test_kto_loss import HFKTOLoss + + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_ref_bias, dtype=dtype) + self.KTO_loss = HFKTOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y, preference_labels, kl=None): + return self.KTO_loss( + weight=self.lin.weight, + _input=x, + target=y, + bias=self.lin.bias, + ref_input=ref_x, + ref_weight=self.ref_lin.weight, + ref_bias=self.ref_lin.bias, + preference_labels=preference_labels, + kl=kl, + ) + + +class LigerLMHeadKTO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + use_bias: bool = False, + use_ref_bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_ref_bias, dtype=dtype) + self.KTO_loss = LigerFusedLinearKTOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + ) + + def forward(self, x, ref_x, y, preference_labels, kl=None): + return self.KTO_loss( + _input=x, + lin_weight=self.lin.weight, + target=y, + preference_labels=preference_labels, + bias=self.lin.bias, + ref_input=ref_x, + ref_weight=self.ref_lin.weight, + ref_bias=self.ref_lin.bias, + kl=kl, + ) + + +def bench_memory_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + beta = input.extra_benchmark_config["beta"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + + torch_kto_loss = TorchLMHeadKTO( + H=H, + V=V, + dtype=dtype, + use_bias=bias, + use_ref_bias=bias, + ignore_index=ignore_index, + beta=beta, + ).to(device) + + liger_kto_loss = LigerLMHeadKTO( + H=H, + V=V, + dtype=dtype, + use_bias=bias, + use_ref_bias=bias, + ignore_index=ignore_index, + beta=beta, + ).to(device) + + # Input shape: [B, T, H] + _input = torch.randn(B, T, H, device=device, dtype=dtype) + + # Target shape: [B, T] + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + # Preference labels shape: [B] + # Create binary preference labels (0 or 1) for each sequence in the batch + # Used to indicate preferred sequences (1) vs non-preferred sequences (0) + preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device) + + # Precomputed KL divergence between policy and reference distributions + kl = torch.randn(1, device=device, dtype=dtype) + + # Add 
ignore_index tokens to simulate padding + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + # Add ref_x with the same shape as _input + ref_input = torch.randn(B, T, H, device=device, dtype=dtype) + + def fwd(): + if provider == "liger": + return liger_kto_loss( + x=_input, + ref_x=ref_input, + y=target, + preference_labels=preference_labels, + kl=kl, + )[0] + elif provider == "huggingface": + return torch_kto_loss( + x=_input, + ref_x=ref_input, + y=target, + preference_labels=preference_labels, + kl=kl, + )[0] + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +def bench_speed_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + bias = input.extra_benchmark_config["bias"] + beta = input.extra_benchmark_config["beta"] + ignore_index = input.extra_benchmark_config["ignore_index"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + torch_kto_loss = TorchLMHeadKTO( + H=H, + V=V, + dtype=dtype, + beta=beta, + ignore_index=ignore_index, + use_bias=bias, + ).to(device) + liger_kto_loss = LigerLMHeadKTO( + H=H, + V=V, + dtype=dtype, + beta=beta, + ignore_index=ignore_index, + use_bias=bias, + ).to(device) + + # Input shape: [B, T, H] + _input = torch.randn(B, T, H, device=device, dtype=dtype) + + # Target shape: [B, T] + target = torch.randint(V, (B, T), device=device, dtype=torch.long) + + # Preference labels shape: [B] + # Create binary preference labels (0 or 1) for each sequence in the batch + # Used to indicate preferred sequences (1) vs non-preferred sequences (0) + preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device) + + # Precomputed KL divergence between policy and reference distributions + kl = torch.randn(1, device=device, dtype=dtype) + + # Add ignore_index tokens + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + # Add ref_x with the same shape as _input + ref_input = torch.randn(B, T, H, device=device, dtype=dtype) + + def fwd(): + if provider == "liger": + return liger_kto_loss( + x=_input, + ref_x=ref_input, + y=target, + preference_labels=preference_labels, + kl=kl, + )[0] + elif provider == "huggingface": + return torch_kto_loss( + x=_input, + ref_x=ref_input, + y=target, + preference_labels=preference_labels, + kl=kl, + )[0] + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "kto_loss", + 
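Both KTO benchmark functions simulate padding by scattering `ignore_index` into a random subset of the flattened targets; those positions are excluded from the loss. The same trick as a standalone sketch, using the `ignore_index = 42` from the config here:

```python
import torch

B, T, V = 4, 16, 100
ignore_index = 42                                  # matches the benchmark config
target = torch.randint(V, (B, T))

n = torch.randint(1, B * T // 2, (1,)).item()      # how many positions to mask
idx = torch.randperm(B * T)[:n]
target.view(-1)[idx] = ignore_index                # in-place: the view shares storage

frac = (target == ignore_index).float().mean()
print(f"masked fraction: {frac:.2f}")
```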
"x_name": "B", + "x_label": "Batch Size (B)", + "x_values": [2**i for i in range(1, 6)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "T": 512, + "H": 1024, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + "bias": True, + "beta": 0.1, + "ignore_index": 42, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_kto_loss, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_kto_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_layer_norm.py b/benchmark/scripts/benchmark_layer_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..0addf78ed8fae60d04869e67f317dac04f3df047 --- /dev/null +++ b/benchmark/scripts/benchmark_layer_norm.py @@ -0,0 +1,125 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.layer_norm import LigerLayerNorm +from liger_kernel.utils import infer_device + +device = infer_device() + + +def bench_speed_layer_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + triton_ln = LigerLayerNorm(hidden_size=N).to(device) + torch_ln = torch.nn.LayerNorm(N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def y_fwd(): + if provider == "liger": + return triton_ln(x) + if provider == "huggingface": + return torch_ln(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(y_fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500) + elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[x], + rep=500, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=[x], rep=500) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_layer_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + dtype = input.extra_benchmark_config["dtype"] + M = input.extra_benchmark_config["M"] + eps = input.extra_benchmark_config["eps"] + + x_shape = (M, N) + + triton_ln = LigerLayerNorm(hidden_size=N).to(device) + torch_ln = torch.nn.LayerNorm(N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def y_fwd(): + if provider == "liger": + return triton_ln(x) + if provider == "huggingface": + return torch_ln(x) + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + 
+if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "layer_norm", + "x_name": "N", + "x_label": "hidden size", + "x_values": [2**i for i in range(10, 15)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [{"M": 4096, "dtype": torch.float32, "eps": 1e-6}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_layer_norm, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_layer_norm, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_llama4_rope.py b/benchmark/scripts/benchmark_llama4_rope.py new file mode 100755 index 0000000000000000000000000000000000000000..47d06051e4034ed06d24ea8eb08ec2014135560f --- /dev/null +++ b/benchmark/scripts/benchmark_llama4_rope.py @@ -0,0 +1,245 @@ +import torch +import triton + +from transformers.models.llama4.configuration_llama4 import Llama4TextConfig +from transformers.models.llama4.modeling_llama4 import Llama4TextRotaryEmbedding +from transformers.models.llama4.modeling_llama4 import apply_rotary_emb +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.llama4_rope import liger_llama4_text_rotary_pos_emb +from liger_kernel.utils import infer_device +from liger_kernel.utils import transformers_version_dispatch + +device = infer_device() + + +def bench_speed_llama4_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + + # Create Llama4TextConfig for the rotary embedding + config = Llama4TextConfig( + hidden_size=hidden_size, + num_attention_heads=num_q_heads, + num_key_value_heads=num_kv_heads, + head_dim=head_dim, + max_position_embeddings=seq_len, + ) + + rotary_emb = transformers_version_dispatch( + "4.48.0", + Llama4TextRotaryEmbedding, + Llama4TextRotaryEmbedding, + before_kwargs={"config": config, "device": device}, + after_kwargs={"config": config, "device": device}, + ) + + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ) + k = torch.randn( + (1, seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device), + ) + pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0) + freqs_cis = rotary_emb(q, pos_ids) + + def fwd(): + if provider == "liger": + return liger_llama4_text_rotary_pos_emb(q, k, freqs_cis) + elif provider == "huggingface": + return apply_rotary_emb(q, k, freqs_cis) + else: + raise ValueError(f"Invalid provider: 
{provider} for Llama4 RoPE embedding") + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "backward": + q_out, k_out = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True), + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + q_out, k_out = fwd() + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_llama4_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + + # Create Llama4TextConfig for the rotary embedding + config = Llama4TextConfig( + hidden_size=hidden_size, + num_attention_heads=num_q_heads, + num_key_value_heads=num_kv_heads, + head_dim=head_dim, + max_position_embeddings=seq_len, + ) + + rotary_emb = transformers_version_dispatch( + "4.48.0", + Llama4TextRotaryEmbedding, + Llama4TextRotaryEmbedding, + before_kwargs={"config": config, "device": device}, + after_kwargs={"config": config, "device": device}, + ) + + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ) + k = torch.randn( + (1, seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device), + ) + pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0) + freqs_cis = rotary_emb(q, pos_ids) + + def full(): + if provider == "liger": + q_out, k_out = liger_llama4_text_rotary_pos_emb(q, k, freqs_cis) + else: + q_out, k_out = apply_rotary_emb(q, k, freqs_cis) + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory( + full, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs_varying_hidden_size = { + "kernel_name": "llama4_rope", + "x_name": "H", + "x_label": "hidden size", + "x_values": [32 * (2**i) for i in range(4, 10, 2)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "seq_len": 2048, + "num_q_heads": 32, + "num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_llama4_rope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_hidden_size, + ) + run_benchmarks( + bench_test_fn=bench_memory_llama4_rope, + kernel_operation_modes=["full"], + metric_name="memory", 
+ metric_unit="MB", + **common_configs_varying_hidden_size, + ) + + common_configs_varying_seq_len = { + "kernel_name": "llama4_rope", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, 15)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "hidden_size": 8192, + "num_q_heads": 32, + "num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_llama4_rope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_seq_len, + ) + run_benchmarks( + bench_test_fn=bench_memory_llama4_rope, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs_varying_seq_len, + ) diff --git a/benchmark/scripts/benchmark_mhc.py b/benchmark/scripts/benchmark_mhc.py new file mode 100755 index 0000000000000000000000000000000000000000..47cdd6336879510244f2433285c9ec6fe9fcb449 --- /dev/null +++ b/benchmark/scripts/benchmark_mhc.py @@ -0,0 +1,255 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.functional import liger_mhc_coeffs +from liger_kernel.transformers.functional import liger_mhc_post_res +from liger_kernel.transformers.functional import liger_mhc_pre +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +def bench_speed_mhc(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + from test.transformers.test_mhc import mhc_coeffs_ref + + T = input.x + B = input.extra_benchmark_config["B"] + HC = input.extra_benchmark_config["HC"] + C = input.extra_benchmark_config["C"] + sub_kernel = input.extra_benchmark_config["sub_kernel"] + tmax = input.extra_benchmark_config["tmax"] + rms_eps = input.extra_benchmark_config["rms_eps"] + pre_eps = input.extra_benchmark_config["pre_eps"] + sinkhorn_eps = input.extra_benchmark_config["sinkhorn_eps"] + post_mult = input.extra_benchmark_config["post_mult"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + coeffs_cfg = dict(tmax=tmax, rms_eps=rms_eps, pre_eps=pre_eps, sinkhorn_eps=sinkhorn_eps, post_mult=post_mult) + need_grad = mode in ("backward", "full") + + x = torch.randn(B, T, HC, C, device=device, dtype=torch.bfloat16, requires_grad=need_grad) + K, M = HC * C, HC * HC + 2 * HC + phi = (torch.randn(K, M, device=device, dtype=torch.bfloat16) * 0.02).requires_grad_(need_grad) + b_param = torch.zeros(M, device=device, dtype=torch.float32, requires_grad=need_grad) + alpha_pre = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=need_grad) + alpha_post = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=need_grad) + alpha_res = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=need_grad) + + grad_to_none = [x, phi, b_param, alpha_pre, alpha_post, alpha_res] if need_grad else None + + if sub_kernel == "coeffs": + + def fwd(): + if provider == "liger": + return liger_mhc_coeffs(x, phi, b_param, alpha_pre, alpha_post, alpha_res, **coeffs_cfg) + return mhc_coeffs_ref(x, phi, b_param, alpha_pre, alpha_post, alpha_res, **coeffs_cfg) + + def fwd_loss(): + h_pre, 
h_post, h_res = fwd() + return h_pre.square().mean() + h_post.square().mean() + h_res.square().mean() + + elif sub_kernel == "pre": + with torch.no_grad(): + h_pre_c, _, _ = liger_mhc_coeffs( + x.detach(), + phi.detach(), + b_param.detach(), + alpha_pre.detach(), + alpha_post.detach(), + alpha_res.detach(), + **coeffs_cfg, + ) + h_pre_c.requires_grad_(need_grad) + grad_to_none = [x, h_pre_c] if need_grad else None + + def fwd(): + if provider == "liger": + return liger_mhc_pre(x, h_pre_c) + return (x.float() * h_pre_c.unsqueeze(-1)).sum(dim=-2) + + def fwd_loss(): + return fwd().square().mean() + + elif sub_kernel == "post_res": + with torch.no_grad(): + _, h_post_c, h_res_c = liger_mhc_coeffs( + x.detach(), + phi.detach(), + b_param.detach(), + alpha_pre.detach(), + alpha_post.detach(), + alpha_res.detach(), + **coeffs_cfg, + ) + h_post_c.requires_grad_(need_grad) + h_res_c.requires_grad_(need_grad) + f_out = torch.randn(B, T, C, device=device, dtype=torch.bfloat16, requires_grad=need_grad) + grad_to_none = [x, f_out, h_post_c, h_res_c] if need_grad else None + + def fwd(): + if provider == "liger": + return liger_mhc_post_res(x, f_out, h_post_c, h_res_c) + return torch.einsum("...oi,...ic->...oc", h_res_c, x.float()) + h_post_c.unsqueeze( + -1 + ) * f_out.float().unsqueeze(-2) + + def fwd_loss(): + return fwd().square().mean() + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES) + elif mode == "backward": + y = fwd_loss() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=grad_to_none, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd_loss() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, grad_to_none=grad_to_none, rep=100, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput(y_20=ms_20, y_50=ms_50, y_80=ms_80) + + +def bench_memory_mhc(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + from test.transformers.test_mhc import mhc_coeffs_ref + + T = input.x + B = input.extra_benchmark_config["B"] + HC = input.extra_benchmark_config["HC"] + C = input.extra_benchmark_config["C"] + sub_kernel = input.extra_benchmark_config["sub_kernel"] + tmax = input.extra_benchmark_config["tmax"] + rms_eps = input.extra_benchmark_config["rms_eps"] + pre_eps = input.extra_benchmark_config["pre_eps"] + sinkhorn_eps = input.extra_benchmark_config["sinkhorn_eps"] + post_mult = input.extra_benchmark_config["post_mult"] + provider = input.kernel_provider + + coeffs_cfg = dict(tmax=tmax, rms_eps=rms_eps, pre_eps=pre_eps, sinkhorn_eps=sinkhorn_eps, post_mult=post_mult) + + x = torch.randn(B, T, HC, C, device=device, dtype=torch.bfloat16, requires_grad=True) + K, M = HC * C, HC * HC + 2 * HC + phi = (torch.randn(K, M, device=device, dtype=torch.bfloat16) * 0.02).requires_grad_(True) + b_param = torch.zeros(M, device=device, dtype=torch.float32, requires_grad=True) + alpha_pre = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=True) + alpha_post = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=True) + alpha_res = torch.tensor(1.0, device=device, dtype=torch.float32, requires_grad=True) + + if sub_kernel == "coeffs": + + def full(): + if provider == "liger": + hp, hpo, hr = liger_mhc_coeffs(x, phi, b_param, alpha_pre, alpha_post, alpha_res, **coeffs_cfg) + else: + hp, hpo, hr = mhc_coeffs_ref(x, phi, b_param, alpha_pre, alpha_post, alpha_res, **coeffs_cfg) + (hp.square().mean() + 
hpo.square().mean() + hr.square().mean()).backward() + + elif sub_kernel == "pre": + with torch.no_grad(): + h_pre_c, _, _ = liger_mhc_coeffs( + x.detach(), + phi.detach(), + b_param.detach(), + alpha_pre.detach(), + alpha_post.detach(), + alpha_res.detach(), + **coeffs_cfg, + ) + h_pre_c.requires_grad_(True) + + def full(): + if provider == "liger": + out = liger_mhc_pre(x, h_pre_c) + else: + out = (x.float() * h_pre_c.unsqueeze(-1)).sum(dim=-2) + out.square().mean().backward() + + elif sub_kernel == "post_res": + with torch.no_grad(): + _, h_post_c, h_res_c = liger_mhc_coeffs( + x.detach(), + phi.detach(), + b_param.detach(), + alpha_pre.detach(), + alpha_post.detach(), + alpha_res.detach(), + **coeffs_cfg, + ) + h_post_c.requires_grad_(True) + h_res_c.requires_grad_(True) + f_out = torch.randn(B, T, C, device=device, dtype=torch.bfloat16, requires_grad=True) + + def full(): + if provider == "liger": + out = liger_mhc_post_res(x, f_out, h_post_c, h_res_c) + else: + out = torch.einsum("...oi,...ic->...oc", h_res_c, x.float()) + h_post_c.unsqueeze( + -1 + ) * f_out.float().unsqueeze(-2) + out.square().mean().backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput(y_20=mem_20, y_50=mem_50, y_80=mem_80) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + for sub_kernel in ["coeffs", "pre", "post_res"]: + common_configs = { + "kernel_name": f"mhc_{sub_kernel}", + "x_name": "T", + "x_label": "Sequence Length (T)", + "x_values": [2**i for i in range(7, 12)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "B": 4, + "HC": 4, + "C": 4096, + "tmax": 20, + "rms_eps": 1e-6, + "pre_eps": 0.0, + "sinkhorn_eps": 1e-6, + "post_mult": 2.0, + "sub_kernel": sub_kernel, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_mhc, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + + run_benchmarks( + bench_test_fn=bench_memory_mhc, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_mhc_lm.py b/benchmark/scripts/benchmark_mhc_lm.py new file mode 100755 index 0000000000000000000000000000000000000000..6330a0e1a51da94bc7994731bc1bac9344e600d0 --- /dev/null +++ b/benchmark/scripts/benchmark_mhc_lm.py @@ -0,0 +1,455 @@ +import os +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.mhc import LigerMHC +from liger_kernel.utils import infer_device + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + +device = infer_device() + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size: int, *, eps: float, dtype: torch.dtype, device: str): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + var = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(var + self.eps) + return x * self.weight + + +def _build_rope_cache(seq_len: int, head_dim: int, *, device: torch.device, dtype: torch.dtype): + inv_freq = 1.0 / (10000 ** 
(torch.arange(0, head_dim, 2, device=device, dtype=torch.float32) / head_dim)) + positions = torch.arange(seq_len, device=device, dtype=torch.float32) + freqs = torch.einsum("i,j->ij", positions, inv_freq) + cos = freqs.cos().to(dtype) + sin = freqs.sin().to(dtype) + return cos, sin + + +def _apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + cos = cos[None, None, :, :] + sin = sin[None, None, :, :] + return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1) + + +class MiniLlamaAttention(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, *, dtype: torch.dtype, device: str): + super().__init__() + assert hidden_size % num_heads == 0 + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + assert self.head_dim % 2 == 0, "head_dim must be even for RoPE" + + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device) + self.o_proj = nn.Linear(hidden_size, hidden_size, bias=False, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bsz, seq_len, _ = x.shape + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + q = q.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + k = k.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + v = v.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = _build_rope_cache(seq_len, self.head_dim, device=x.device, dtype=q.dtype) + q = _apply_rope(q, cos, sin) + k = _apply_rope(k, cos, sin) + + attn = F.scaled_dot_product_attention(q, k, v, is_causal=True) + attn = attn.transpose(1, 2).contiguous().view(bsz, seq_len, self.hidden_size) + return self.o_proj(attn) + + +class MiniLlamaMLP(nn.Module): + def __init__(self, hidden_size: int, intermediate_mult: int, *, dtype: torch.dtype, device: str): + super().__init__() + intermediate_size = hidden_size * intermediate_mult + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False, dtype=dtype, device=device) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) + + +class AttentionBlock(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, *, dtype: torch.dtype, device: str): + super().__init__() + self.norm = RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.attn = MiniLlamaAttention(hidden_size, num_heads, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.attn(self.norm(x)) + + +class MLPBlock(nn.Module): + def __init__(self, hidden_size: int, intermediate_mult: int, *, dtype: torch.dtype, device: str): + super().__init__() + self.norm = RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.mlp = MiniLlamaMLP(hidden_size, intermediate_mult, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.mlp(self.norm(x)) + + +class TorchMHC(nn.Module): + def __init__( + self, + layer: nn.Module, + *, + hc: int, + c: int, + tmax: int, + rms_eps: float, + 
pre_eps: float, + sinkhorn_eps: float, + post_mult: float, + phi_dtype: torch.dtype, + ): + super().__init__() + self.layer = layer + self.hc = int(hc) + self.c = int(c) + self.tmax = int(tmax) + self.rms_eps = float(rms_eps) + self.pre_eps = float(pre_eps) + self.sinkhorn_eps = float(sinkhorn_eps) + self.post_mult = float(post_mult) + + layer_param = next(layer.parameters()) + device = layer_param.device + + m = hc * hc + 2 * hc + k = hc * c + self.phi = nn.Parameter(torch.randn(k, m, dtype=phi_dtype, device=device) * 0.02) + self.b = nn.Parameter(torch.zeros(m, dtype=torch.float32, device=device)) + self.alpha_pre = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=device)) + self.alpha_post = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=device)) + self.alpha_res = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=device)) + + self.layer_dtype = layer_param.dtype + + def _coeffs(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + from test.transformers.test_mhc import mhc_coeffs_ref + + return mhc_coeffs_ref( + x, + self.phi, + self.b, + self.alpha_pre, + self.alpha_post, + self.alpha_res, + tmax=self.tmax, + rms_eps=self.rms_eps, + pre_eps=self.pre_eps, + sinkhorn_eps=self.sinkhorn_eps, + post_mult=self.post_mult, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + h_pre, h_post, h_res = self._coeffs(x) + x_in = (x.float() * h_pre.unsqueeze(-1)).sum(dim=-2) + if x_in.dtype != self.layer_dtype: + x_in = x_in.to(self.layer_dtype) + f_out = self.layer(x_in) + x_out = torch.einsum("...oi,...ic->...oc", h_res, x.float()) + h_post.unsqueeze(-1) * f_out.float().unsqueeze( + -2 + ) + return x_out.to(x.dtype) + + +class MHCDecoderLayer(nn.Module): + def __init__( + self, + mhc_cls: type[nn.Module], + *, + hidden_size: int, + hc: int, + num_heads: int, + intermediate_mult: int, + tmax: int, + dtype: torch.dtype, + device: str, + ): + super().__init__() + attn = AttentionBlock(hidden_size, num_heads, dtype=dtype, device=device) + mlp = MLPBlock(hidden_size, intermediate_mult, dtype=dtype, device=device) + self.attn = mhc_cls( + attn, + hc=hc, + c=hidden_size, + tmax=tmax, + rms_eps=1e-6, + pre_eps=1e-4, + sinkhorn_eps=1e-6, + post_mult=2.0, + phi_dtype=dtype, + ) + self.mlp = mhc_cls( + mlp, + hc=hc, + c=hidden_size, + tmax=tmax, + rms_eps=1e-6, + pre_eps=1e-4, + sinkhorn_eps=1e-6, + post_mult=2.0, + phi_dtype=dtype, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.attn(x) + x = self.mlp(x) + return x + + +class BenchMiniMHCLM(nn.Module): + def __init__( + self, + mhc_cls: type[nn.Module], + *, + vocab_size: int, + hidden_size: int, + hc: int, + num_layers: int, + num_heads: int, + intermediate_mult: int, + tmax: int, + dtype: torch.dtype, + device: str, + ): + super().__init__() + self.hc = hc + self.hidden_size = hidden_size + self.embed = nn.Embedding(vocab_size, hc * hidden_size, dtype=dtype, device=device) + self.layers = nn.ModuleList( + [ + MHCDecoderLayer( + mhc_cls, + hidden_size=hidden_size, + hc=hc, + num_heads=num_heads, + intermediate_mult=intermediate_mult, + tmax=tmax, + dtype=dtype, + device=device, + ) + for _ in range(num_layers) + ] + ) + self.final_norm = RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False, dtype=dtype, device=device) + + def forward(self, input_ids: torch.Tensor) -> torch.Tensor: + x = self.embed(input_ids) + bsz, seq_len, _ = x.shape + x = x.view(bsz, seq_len, self.hc, self.hidden_size) + for layer in 
self.layers: + x = layer(x) + x = x.mean(dim=-2) + x = self.final_norm(x) + return self.lm_head(x) + + +def _build_model( + provider: str, + *, + hidden_size: int, + hc: int, + num_layers: int, + num_heads: int, + intermediate_mult: int, + vocab_size: int, + tmax: int, + dtype: torch.dtype, +): + mhc_cls = LigerMHC if provider == "liger" else TorchMHC + return BenchMiniMHCLM( + mhc_cls, + vocab_size=vocab_size, + hidden_size=hidden_size, + hc=hc, + num_layers=num_layers, + num_heads=num_heads, + intermediate_mult=intermediate_mult, + tmax=tmax, + dtype=dtype, + device=device, + ) + + +def bench_speed_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + hidden_size = int(input.x) + provider = input.kernel_provider + mode = input.kernel_operation_mode + extra = input.extra_benchmark_config + bsz = extra["B"] + seq_len = extra["T"] + hc = extra["HC"] + num_layers = extra["layers"] + num_heads = extra["heads"] + vocab_size = extra["vocab"] + dtype = extra["dtype"] + tmax = extra["tmax"] + intermediate_mult = extra["intermediate_mult"] + + if hidden_size % num_heads != 0: + raise ValueError("hidden_size must be divisible by num_heads") + + model = _build_model( + provider, + hidden_size=hidden_size, + hc=hc, + num_layers=num_layers, + num_heads=num_heads, + intermediate_mult=intermediate_mult, + vocab_size=vocab_size, + tmax=tmax, + dtype=dtype, + ) + + input_ids = torch.randint(0, vocab_size, (bsz, seq_len), device=device) + + def fwd(): + return model(input_ids) + + def fwd_loss(): + return fwd().float().mean() + + grad_to_none = list(model.parameters()) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, grad_to_none=grad_to_none, rep=100) + elif mode == "backward": + loss = fwd_loss() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: loss.backward(retain_graph=True), + quantiles=QUANTILES, + grad_to_none=grad_to_none, + rep=100, + ) + elif mode == "full": + + def full(): + loss = fwd_loss() + loss.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=grad_to_none, rep=100) + else: + raise ValueError(f"Unknown mode: {mode}") + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_mhc_lm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + hidden_size = int(input.x) + provider = input.kernel_provider + extra = input.extra_benchmark_config + bsz = extra["B"] + seq_len = extra["T"] + hc = extra["HC"] + num_layers = extra["layers"] + num_heads = extra["heads"] + vocab_size = extra["vocab"] + dtype = extra["dtype"] + tmax = extra["tmax"] + intermediate_mult = extra["intermediate_mult"] + + if hidden_size % num_heads != 0: + raise ValueError("hidden_size must be divisible by num_heads") + + model = _build_model( + provider, + hidden_size=hidden_size, + hc=hc, + num_layers=num_layers, + num_heads=num_heads, + intermediate_mult=intermediate_mult, + vocab_size=vocab_size, + tmax=tmax, + dtype=dtype, + ) + + input_ids = torch.randint(0, vocab_size, (bsz, seq_len), device=device) + + def fwd(): + return model(input_ids) + + def full(): + loss = fwd().float().mean() + loss.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "mhc_llama_like_lm", + "x_name": "hidden_size", + "x_label": "hidden_size", + 
"x_values": [256, 512, 1024], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "B": 2, + "T": 256, + "HC": 4, + "layers": 2, + "heads": 8, + "vocab": 4096, + "dtype": torch.bfloat16, + "tmax": 8, + "intermediate_mult": 4, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_mhc_lm, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_mhc_lm, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_model_configs.py b/benchmark/scripts/benchmark_model_configs.py new file mode 100755 index 0000000000000000000000000000000000000000..630b0d555e7050fca83ae1c856fd97ac4812a8bb --- /dev/null +++ b/benchmark/scripts/benchmark_model_configs.py @@ -0,0 +1,258 @@ +""" +Standardized benchmark model configurations. + +Provides canonical model architecture profiles and device-specific benchmark +parameters. All benchmark scripts should derive their tensor shapes from these +shared configs rather than defining ad-hoc per-script constants. + +Usage:: + + from benchmark_model_configs import ( + get_benchmark_model_config, + compute_seq_len_sweep_config, + estimate_kernel_peak_memory, + ) + + args = parse_benchmark_script_args() + model = get_benchmark_model_config(args.model) + + # Measure actual memory via a small probe, then compute sweep config + peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe) + bpt = peak_bytes // probe_num_tokens + config = compute_seq_len_sweep_config(model, kernel_bytes_per_token=bpt) +""" + +import gc +import math + +from dataclasses import dataclass +from typing import Callable +from typing import Dict +from typing import Optional + +import torch + +from liger_kernel.utils import get_total_gpu_memory +from liger_kernel.utils import infer_device + + +@dataclass(frozen=True) +class ModelConfig: + """Canonical model architecture profile. + + Each field corresponds to a standard LLM hyperparameter. Benchmark scripts + pick the fields they need (e.g. hidden_size for RMSNorm, vocab_size for + CrossEntropy) while kernel-specific overrides (e.g. hidden_act for GEGLU) + are applied locally in the benchmark script. + """ + + name: str + hidden_size: int + intermediate_size: int + vocab_size: int + num_attention_heads: int + num_key_value_heads: int + head_dim: int + hidden_act: str + max_position_embeddings: int = 8192 + rms_norm_eps: float = 1e-5 + dtype: torch.dtype = torch.bfloat16 + + +@dataclass(frozen=True) +class SeqLenSweepConfig: + """Config for benchmarks that sweep sequence length (e.g. GEGLU, SwiGLU). + + Attributes: + batch_size: Safe batch size for the sweep. + seq_len: Max sequence length (upper bound for x_values). + """ + + batch_size: int + seq_len: int + + +@dataclass(frozen=True) +class HiddenSizeSweepConfig: + """Config for benchmarks that sweep hidden_size with fixed BT (e.g. DyT). + + Attributes: + bt: Fixed batch * seq dimension. + max_hidden_size: Upper bound for hidden_size sweep. 
+ """ + + bt: int + max_hidden_size: int + + +# ── Model Profiles ────────────────────────────────────────────────────────── + +LLAMA_2_7B = ModelConfig( + name="llama_2_7b", + hidden_size=4096, + intermediate_size=11008, + vocab_size=32000, + num_attention_heads=32, + num_key_value_heads=32, + head_dim=128, + hidden_act="silu", + max_position_embeddings=4096, +) + +LLAMA_3_8B = ModelConfig( + name="llama_3_8b", + hidden_size=4096, + intermediate_size=14336, + vocab_size=128256, + num_attention_heads=32, + num_key_value_heads=8, + head_dim=128, + hidden_act="silu", + max_position_embeddings=8192, +) + +MODEL_REGISTRY: Dict[str, ModelConfig] = { + "llama_2_7b": LLAMA_2_7B, + "llama_3_8b": LLAMA_3_8B, +} + +DEFAULT_MODEL_CONFIG = LLAMA_3_8B + + +def get_benchmark_model_config(model_name: Optional[str] = None) -> ModelConfig: + """Resolve benchmark model config from name. + + Returns the canonical model architecture profile (hidden_size, vocab_size, + dtype, etc.) for benchmark runs. Use this to obtain model attributes + when building benchmark tensors and shapes. + + Args: + model_name: Registry key (e.g. ``llama_2_7b``, ``llama_3_8b``). + If None, returns ``DEFAULT_MODEL_CONFIG``. + """ + return MODEL_REGISTRY[model_name] if model_name else DEFAULT_MODEL_CONFIG + + +def estimate_kernel_peak_memory(probe_fn: Callable[[], torch.Tensor]) -> int: + """Run a forward + backward probe to measure peak memory (bytes). + + Call this with the *pure PyTorch* (e.g. huggingface) implementation -- + that typically has the highest memory footprint and therefore gives a + safe upper-bound estimate. Returns the total peak bytes; divide by + num_tokens if you need bytes-per-token for :func:`compute_seq_len_sweep_config`. + + The probe_fn performs setup and forward pass internally; cleanup is + automatic, so callers do not need to manage tensor/layer lifecycle. + + Example:: + + peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe) + kernel_bpt = peak_bytes // num_tokens # if needed + + Args: + probe_fn: Callable that performs setup, runs a forward pass, and + returns an output tensor suitable for ``.backward()``. + """ + device_str = infer_device() + torch_device_mod = getattr(torch, device_str) + + gc.collect() + torch_device_mod.empty_cache() + torch_device_mod.memory.reset_peak_memory_stats() + + y = probe_fn() + y.backward(torch.randn_like(y)) + + peak_bytes = torch_device_mod.max_memory_allocated() + + del y + gc.collect() + torch_device_mod.empty_cache() + + return max(1, peak_bytes) + + +def compute_seq_len_sweep_config( + model_cfg: ModelConfig, + kernel_bytes_per_token: Optional[int] = None, + memory_utilization: float = 0.4, + max_seq_len: Optional[int] = None, + max_batch_size: int = 32, +) -> SeqLenSweepConfig: + """Compute safe batch_size and seq_len for sequence-length sweep (e.g. GEGLU). + + Peak memory is estimated as + ``batch_size * seq_len * kernel_bytes_per_token`` and is capped at + device memory * memory_utilization. Device memory is obtained + internally via :func:`~liger_kernel.utils.get_total_gpu_memory`. + + Prefer obtaining *kernel_bytes_per_token* via + :func:`estimate_kernel_peak_memory` (divide by num_tokens) rather + than hardcoding an analytical estimate. + + Args: + model_cfg: Model architecture config. + kernel_bytes_per_token: Peak memory **per token** (``batch * seq_len`` + axis). Best obtained from :func:`estimate_kernel_peak_memory` / num_tokens. + Falls back to a conservative heuristic + (``hidden_size * dtype_bytes * 16``) when *None*. 
+ memory_utilization: Fraction of total device memory to target (0 to 1). + Lower values are safer. Default ``0.4`` leaves headroom for + framework overhead and CUDA/NPU context. + max_seq_len: Hard upper bound for sequence length. Defaults to + ``model_cfg.max_position_embeddings`` so the sweep never exceeds + the model's native context window. + max_batch_size: Hard upper bound for batch size. + """ + total_memory_gb = get_total_gpu_memory() + dtype_bytes = 2 if model_cfg.dtype in (torch.bfloat16, torch.float16) else 4 + + if kernel_bytes_per_token is None: + kernel_bytes_per_token = model_cfg.hidden_size * dtype_bytes * 16 + + if max_seq_len is None: + max_seq_len = model_cfg.max_position_embeddings + + usable_bytes = total_memory_gb * (1024**3) * memory_utilization + max_tokens = max(1, int(usable_bytes / kernel_bytes_per_token)) + + seq_len = min(max_seq_len, max_tokens) + seq_len = 2 ** int(math.log2(seq_len)) if seq_len >= 1024 else 1024 + + batch_size = max(1, min(max_tokens // seq_len, max_batch_size)) + + return SeqLenSweepConfig(batch_size=batch_size, seq_len=seq_len) + + +def compute_hidden_size_sweep_config( + model_cfg: ModelConfig, + kernel_peak_bytes: int, + bt: int = 4096, + memory_utilization: float = 0.4, + max_hidden_size_multiplier: int = 4, +) -> HiddenSizeSweepConfig: + """Compute safe max_hidden_size for hidden_size sweep (e.g. DyT). + + For kernels with shape (BT, hidden_size) where BT is fixed and we sweep + hidden_size. Uses probe peak memory to derive max_hidden_size. + Device memory is obtained internally via :func:`~liger_kernel.utils.get_total_gpu_memory`. + + Args: + model_cfg: Model config. + kernel_peak_bytes: Peak memory from probe (BT, model.hidden_size). + bt: Fixed BT dimension; must match the probe. + memory_utilization: Fraction of device memory to use. + max_hidden_size_multiplier: Cap max_hidden_size at model.hidden_size * this. 
+ """ + total_memory_gb = get_total_gpu_memory() + usable_bytes = total_memory_gb * (1024**3) * memory_utilization + kernel_bpt = max(1, kernel_peak_bytes // bt) + max_hidden_size = min( + model_cfg.hidden_size * max_hidden_size_multiplier, + max( + model_cfg.hidden_size, + int(usable_bytes * model_cfg.hidden_size / (bt * kernel_bpt)), + ), + ) + max_hidden_size = max(1024, 2 ** int(math.log2(max_hidden_size))) + return HiddenSizeSweepConfig(bt=bt, max_hidden_size=max_hidden_size) diff --git a/benchmark/scripts/benchmark_multi_token_attention.py b/benchmark/scripts/benchmark_multi_token_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..b5319af5c70baa9ec547bdac130a98ff4ed4e6e4 --- /dev/null +++ b/benchmark/scripts/benchmark_multi_token_attention.py @@ -0,0 +1,218 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.multi_token_attention import LigerMultiTokenAttention +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchMultiTokenAttention(torch.nn.Module): + def __init__(self, C_in, C_out, K, groups, bias, dtype, device): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(C_out, C_in // groups, K, K, dtype=dtype, device=device)) + self.bias = torch.nn.Parameter(torch.empty(C_out, dtype=dtype, device=device)) if bias else None + self.K = K + self.groups = groups + + def forward(self, scores): + B, C_in, L, _ = scores.shape + mask = torch.tril(torch.ones(L, L, dtype=torch.bool, device=scores.device)).view(1, 1, L, L) + inf = torch.tensor(-1e9, device=scores.device, dtype=scores.dtype) + zero = torch.tensor(0.0, device=scores.device, dtype=scores.dtype) + s_inf = scores.masked_fill(~mask, inf) + probs = torch.nn.functional.softmax(s_inf, dim=-1) + out_c = torch.nn.functional.conv2d( + probs, self.weight, self.bias, stride=1, padding=self.K // 2, groups=self.groups + ) + return out_c.masked_fill(~mask, zero) + + +def bench_speed_multi_token_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + L = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + C_in = extra_benchmark_config["C_in"] + C_out = extra_benchmark_config["C_out"] + K = extra_benchmark_config["K"] + groups = extra_benchmark_config["groups"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B, C_in, L, L) + + triton_attn = ( + LigerMultiTokenAttention( + in_channels=C_in, + out_channels=C_out, + kernel_size=K, + stride=1, + padding=K // 2, + dilation=1, + groups=groups, + bias=bias, + ) + .to(device) + .to(dtype) + ) + + torch_attn = TorchMultiTokenAttention( + C_in=C_in, C_out=C_out, K=K, groups=groups, bias=bias, dtype=dtype, device=device + ) + + with torch.no_grad(): + torch_attn.weight.copy_(triton_attn.weight) + if bias: + torch_attn.bias.copy_(triton_attn.bias) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return triton_attn(x) + elif provider == "torch": + return torch_attn(x) + + print(f"Starting Warmup for input size: {x_shape}") + _ = fwd() + if mode in ("backward", "full"): + y = _ + y.backward(dy, 
retain_graph=True) + print("Done Warmup") + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, grad_to_none=[x], rep=100, quantiles=QUANTILES) + elif mode == "backward": + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, grad_to_none=[x], rep=100, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_multi_token_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + L = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + C_in = extra_benchmark_config["C_in"] + C_out = extra_benchmark_config["C_out"] + K = extra_benchmark_config["K"] + groups = extra_benchmark_config["groups"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B, C_in, L, L) + + triton_attn = ( + LigerMultiTokenAttention( + in_channels=C_in, + out_channels=C_out, + kernel_size=K, + stride=1, + padding=K // 2, + dilation=1, + groups=groups, + bias=bias, + ) + .to(device) + .to(dtype) + ) + + torch_attn = TorchMultiTokenAttention( + C_in=C_in, C_out=C_out, K=K, groups=groups, bias=bias, dtype=dtype, device=device + ) + + with torch.no_grad(): + torch_attn.weight.copy_(triton_attn.weight) + if bias: + torch_attn.bias.copy_(triton_attn.bias) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return triton_attn(x) + elif provider == "torch": + return torch_attn(x) + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "multi_token_attention", + "x_name": "L", + "x_label": "sequence length", + "x_values": [2**i for i in range(5, 10)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "B": 2, + "C_in": 4, + "C_out": 4, + "K": 3, + "groups": 1, + "bias": True, + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_multi_token_attention, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_multi_token_attention, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_orpo_loss.py b/benchmark/scripts/benchmark_orpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..30b308c42605e08758165f4fb5e719f098995a5b --- /dev/null +++ b/benchmark/scripts/benchmark_orpo_loss.py @@ -0,0 +1,169 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.utils import infer_device + +device = infer_device() + 
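+# The ORPO reference heads (TorchLMHeadORPO / LigerLMHeadORPO) live under the
+# repo's test/ tree, which is not an installed package, so the repo root is
+# prepended to sys.path and their imports are deferred into the benchmark
+# functions below.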
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+
+#############################################################################
+# Test the memory consumption of the fused linear ORPO loss
+#############################################################################
+
+
+def bench_memory_fused_linear_orpo_loss(
+    input: SingleBenchmarkRunInput,
+) -> SingleBenchmarkRunOutput:
+    from test.chunked_loss.test_orpo_loss import LigerLMHeadORPO
+    from test.chunked_loss.test_orpo_loss import TorchLMHeadORPO
+
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    provider = input.kernel_provider
+
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target, nll_target: torch_lm_head_orpo(x, target, nll_target)[0]
+    liger_fwd = lambda x, target, nll_target: liger_lm_head_orpo(x, target, nll_target)[0]
+
+    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
+    target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+    nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+
+    def fwd():
+        if provider == "liger":
+            return liger_fwd(_input, target, nll_target)
+        elif provider == "huggingface":
+            return torch_fwd(_input, target, nll_target)
+
+    def full():
+        y = fwd()
+        y.backward()
+
+    mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
+    return SingleBenchmarkRunOutput(
+        y_20=mem_20,
+        y_50=mem_50,
+        y_80=mem_80,
+    )
+
+
+#############################################################################
+# Test the speed of the fused linear ORPO loss
+#############################################################################
+
+
+def bench_speed_fused_linear_orpo_loss(
+    input: SingleBenchmarkRunInput,
+) -> SingleBenchmarkRunOutput:
+    from test.chunked_loss.test_orpo_loss import LigerLMHeadORPO
+    from test.chunked_loss.test_orpo_loss import TorchLMHeadORPO
+
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    provider = input.kernel_provider
+    mode = input.kernel_operation_mode
+
+    # Instantiate once and retrieve the first output only
+    torch_lm_head_orpo = TorchLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    liger_lm_head_orpo = LigerLMHeadORPO(H=H, V=V, dtype=dtype).to(device)
+    torch_fwd = lambda x, target, nll_target: torch_lm_head_orpo(x, target, nll_target)[0]
+    liger_fwd = lambda x, target, nll_target: liger_lm_head_orpo(x, target, nll_target)[0]
+
+    _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device)
+    target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+    nll_target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+
+    def fwd():
+        if provider == "liger":
+            return liger_fwd(_input, target, nll_target)
+        elif provider == "huggingface":
+            return torch_fwd(_input, target, nll_target)
+
+    if mode == "forward":
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            fwd,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "backward":
+        y = fwd()
+
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            lambda: y.backward(retain_graph=True),
+            grad_to_none=[_input],
+            rep=100,
quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_linear_orpo_loss", + "x_name": "B", + "x_label": "B", + "x_values": [2**i for i in range(1, 5)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "T": 1024, + "H": 4096, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_orpo_loss, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_orpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_poly_norm.py b/benchmark/scripts/benchmark_poly_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..ddff431d74276d57376c7589435e4e6e9ea82419 --- /dev/null +++ b/benchmark/scripts/benchmark_poly_norm.py @@ -0,0 +1,197 @@ +import torch +import torch.nn as nn +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.poly_norm import LigerPolyNorm +from liger_kernel.utils import infer_device + +device = infer_device() + + +class NaivePolyNorm(nn.Module): + """ + Naive PyTorch implementation of PolyNorm. 
+ + Reference: + https://github.com/BryceZhuo/PolyCom/ + + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + """ + + def __init__(self, eps=1e-6): + super().__init__() + # Align with PolyCom reference: (1/3, 1/3, 1/3) and bias=1.0 + self.weight = nn.Parameter(torch.full((3,), 1.0 / 3.0)) + self.bias = nn.Parameter(torch.tensor(1.0)) + self.variance_epsilon = eps + + def _norm(self, x): + """RMSNorm operation""" + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon) + + def forward(self, hidden_states): + """ + Forward pass of PolyNorm + + Args: + hidden_states: input tensor of shape (..., H) + + Returns: + output tensor of same shape as input + """ + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + + # Compute powers + x_pow3 = hidden_states**3 + x_pow2 = hidden_states**2 + x_pow1 = hidden_states**1 + + # Normalize each power + norm_x3 = self._norm(x_pow3) + norm_x2 = self._norm(x_pow2) + norm_x1 = self._norm(x_pow1) + + # Weighted sum with bias + output = self.weight[0] * norm_x3 + self.weight[1] * norm_x2 + self.weight[2] * norm_x1 + self.bias + + return output.to(input_dtype) + + +def bench_speed_poly_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + triton_poly = LigerPolyNorm(eps=eps).to(device) + naive_poly = NaivePolyNorm(eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + + def y_fwd(): + if provider == "liger": + return triton_poly(x) + + if provider == "huggingface": + return naive_poly(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + y_fwd, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_poly_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + triton_poly = LigerPolyNorm(eps=eps).to(device) + naive_poly = NaivePolyNorm(eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + def y_fwd(): + if provider == "liger": + return triton_poly(x) + if provider == "huggingface": + return naive_poly(x) + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() 
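+    # Optional correctness spot-check (a sketch, not run by the benchmark;
+    # assumes LigerPolyNorm matches the PolyCom-style (1/3, 1/3, 1/3) weight
+    # and 1.0 bias initialization used by NaivePolyNorm above):
+    #
+    #   x = torch.randn(64, 1024, dtype=torch.bfloat16, device=device)
+    #   torch.testing.assert_close(
+    #       LigerPolyNorm(eps=1e-6).to(device)(x),
+    #       NaivePolyNorm(eps=1e-6).to(device)(x),
+    #       rtol=1e-2,
+    #       atol=1e-2,
+    #   )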
+ + common_configs = { + "kernel_name": "poly_norm", + "x_name": "H", + "x_label": "hidden size", + "x_values": [2**i for i in range(10, 16)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [{"M": 2048, "dtype": torch.bfloat16, "eps": 1e-6}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_poly_norm, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_poly_norm, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_qwen2vl_mrope.py b/benchmark/scripts/benchmark_qwen2vl_mrope.py new file mode 100755 index 0000000000000000000000000000000000000000..ec1c53b8909c11de94bcdc17b744dba310268472 --- /dev/null +++ b/benchmark/scripts/benchmark_qwen2vl_mrope.py @@ -0,0 +1,241 @@ +import torch +import triton + +from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLTextConfig +from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding +from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_multimodal_rotary_pos_emb +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb +from liger_kernel.utils import infer_device + +device = infer_device() + + +def bench_speed_qwen2vl_mrope( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + mrope_section_hw = head_dim * 3 // 16 + mrope_section = [ + head_dim // 2 - 2 * mrope_section_hw, + mrope_section_hw, + mrope_section_hw, + ] + config = Qwen2VLTextConfig( + hidden_size=hidden_size, + num_attention_heads=num_q_heads, + num_key_value_heads=num_kv_heads, + rope_theta=1000000.0, + mrope_section=mrope_section, + ) + rotary_emb = Qwen2VLRotaryEmbedding(config, device=device) + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + k = torch.randn( + (1, seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device, dtype=dtype), + ) + pos_ids = torch.arange(seq_len * 3, device=device, dtype=torch.long).view(3, 1, -1) + cos, sin = rotary_emb(k, pos_ids) + + def fwd(): + if provider == "liger": + return liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section) + elif provider == "huggingface": + return apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section) + else: + raise ValueError(f"Invalid provider: {provider} for M-RoPE embedding") + + if mode == "forward": + ms_50, ms_20, ms_80 = 
triton.testing.do_bench( + fwd, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "backward": + q_out, k_out = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True), + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + q_out, k_out = fwd() + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_qwen2vl_mrope( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + + mrope_section_hw = head_dim * 3 // 16 + mrope_section = [ + head_dim // 2 - 2 * mrope_section_hw, + mrope_section_hw, + mrope_section_hw, + ] + config = Qwen2VLTextConfig( + hidden_size=hidden_size, + num_attention_heads=num_q_heads, + num_key_value_heads=num_kv_heads, + rope_theta=1000000.0, + mrope_section=mrope_section, + ) + rotary_emb = Qwen2VLRotaryEmbedding(config, device=device) + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + k = torch.randn( + (1, seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device, dtype=dtype), + ) + pos_ids = torch.arange(seq_len * 3, device=device, dtype=torch.long).view(3, 1, -1) + cos, sin = rotary_emb(k, pos_ids) + + def full(): + if provider == "liger": + q_out, k_out = liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section) + else: + q_out, k_out = apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section) + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory( + full, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs_varying_hidden_size = { + "kernel_name": "qwen2vl_mrope", + "x_name": "H", + "x_label": "hidden size", + "x_values": [32 * (2**i) for i in range(4, 10, 2)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "seq_len": 2048, + "num_q_heads": 32, + "num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_qwen2vl_mrope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_hidden_size, + ) + run_benchmarks( + bench_test_fn=bench_memory_qwen2vl_mrope, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs_varying_hidden_size, + 
) + + common_configs_varying_seq_len = { + "kernel_name": "qwen2vl_mrope", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, 15)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "hidden_size": 8192, + "num_q_heads": 32, + "num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_qwen2vl_mrope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_seq_len, + ) + run_benchmarks( + bench_test_fn=bench_memory_qwen2vl_mrope, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs_varying_seq_len, + ) diff --git a/benchmark/scripts/benchmark_rms_norm.py b/benchmark/scripts/benchmark_rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..6bcd56a8378a2f6d21dfecf5157569f3d59af9b8 --- /dev/null +++ b/benchmark/scripts/benchmark_rms_norm.py @@ -0,0 +1,162 @@ +import torch +import torch.nn as nn +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.rms_norm import LigerRMSNorm +from liger_kernel.utils import infer_device + +device = infer_device() + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +def bench_speed_rms_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + triton_rms = LigerRMSNorm(hidden_size=N, eps=eps).to(device) + llama_rms = LlamaRMSNorm(hidden_size=N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + + def y_fwd(): + if provider == "liger": + return triton_rms(x) + + if provider == "huggingface": + return llama_rms(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + y_fwd, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_rms_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + + 
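# Rough memory model for this benchmark (a sketch, not a guarantee): one + # fwd+bwd pass keeps x, dy, and x.grad alive (each M * N elements in `dtype`) + # plus whatever the provider caches for backward; _test_memory() in utils.py + # then reports peak allocated MB across iterations, not steady-state usage. +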
extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + eps = extra_benchmark_config["eps"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + + triton_rms = LigerRMSNorm(hidden_size=N, eps=eps).to(device) + llama_rms = LlamaRMSNorm(hidden_size=N, eps=eps).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + def y_fwd(): + if provider == "liger": + return triton_rms(x) + if provider == "huggingface": + return llama_rms(x) + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "rms_norm", + "x_name": "H", + "x_label": "hidden size", + "x_values": [2**i for i in range(10, 16)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [{"M": 2048, "dtype": torch.bfloat16, "eps": 1e-6}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_rms_norm, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_rms_norm, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_rope.py b/benchmark/scripts/benchmark_rope.py new file mode 100755 index 0000000000000000000000000000000000000000..ac792881d41965e1ba4fc2be51cd2545159c646e --- /dev/null +++ b/benchmark/scripts/benchmark_rope.py @@ -0,0 +1,223 @@ +import torch +import triton + +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding +from transformers.models.llama.modeling_llama import apply_rotary_pos_emb +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.rope import liger_rotary_pos_emb +from liger_kernel.utils import infer_device +from liger_kernel.utils import transformers_version_dispatch + +device = infer_device() + + +def bench_speed_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + rotary_emb = transformers_version_dispatch( + "4.48.0", + LlamaRotaryEmbedding, + LlamaRotaryEmbedding, + before_kwargs={"dim": head_dim, "device": device}, + after_kwargs={"config": LlamaConfig(num_kv_heads=num_kv_heads, head_dim=head_dim), "device": device}, + ) + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + k = torch.randn( + (1, 
seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device), + ) + pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0) + cos, sin = rotary_emb(k, pos_ids) + + def fwd(): + if provider == "liger": + return liger_rotary_pos_emb(q, k, cos, sin, pos_ids) + elif provider == "huggingface": + return apply_rotary_pos_emb(q, k, cos, sin, pos_ids) + else: + raise ValueError(f"Invalid provider: {provider} for RoPE embedding") + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "backward": + q_out, k_out = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True), + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + q_out, k_out = fwd() + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[q, k], + rep=400, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_rope(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + num_q_heads = extra_benchmark_config["num_q_heads"] + num_kv_heads = extra_benchmark_config["num_kv_heads"] + dtype = extra_benchmark_config["dtype"] + + # x can be either hidden_size or seq_len + hidden_size = extra_benchmark_config["hidden_size"] if "hidden_size" in extra_benchmark_config else input.x + seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x + + head_dim = hidden_size // num_q_heads + rotary_emb = transformers_version_dispatch( + "4.48.0", + LlamaRotaryEmbedding, + LlamaRotaryEmbedding, + before_kwargs={"dim": head_dim, "device": device}, + after_kwargs={"config": LlamaConfig(num_kv_heads=num_kv_heads, head_dim=head_dim), "device": device}, + ) + q = torch.randn( + (1, seq_len, num_q_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + k = torch.randn( + (1, seq_len, num_kv_heads, head_dim), + device=device, + requires_grad=True, + dtype=dtype, + ).transpose(1, 2) + dq, dk = ( + torch.randn_like(q, device=device, dtype=dtype), + torch.randn_like(k, device=device), + ) + pos_ids = torch.arange(seq_len, device=device, dtype=torch.long).unsqueeze(0) + cos, sin = rotary_emb(k, pos_ids) + + def full(): + if provider == "liger": + q_out, k_out = liger_rotary_pos_emb(q, k, cos, sin, pos_ids) + else: + q_out, k_out = apply_rotary_pos_emb(q, k, cos, sin, pos_ids) + torch.autograd.grad((q_out, k_out), (q, k), (dq, dk), allow_unused=True, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory( + full, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs_varying_hidden_size = { + "kernel_name": "rope", + "x_name": "H", + "x_label": "hidden size", + "x_values": [32 * (2**i) for i in range(4, 10, 2)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "seq_len": 2048, + "num_q_heads": 32, + 
"num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_rope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_hidden_size, + ) + run_benchmarks( + bench_test_fn=bench_memory_rope, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs_varying_hidden_size, + ) + + common_configs_varying_seq_len = { + "kernel_name": "rope", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, 15)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "dtype": torch.bfloat16, + "hidden_size": 8192, + "num_q_heads": 32, + "num_kv_heads": 8, + } + ], + "overwrite": args.overwrite, + } + run_benchmarks( + bench_test_fn=bench_speed_rope, + kernel_operation_modes=["forward", "backward", "full"], + metric_name="speed", + metric_unit="ms", + **common_configs_varying_seq_len, + ) + run_benchmarks( + bench_test_fn=bench_memory_rope, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs_varying_seq_len, + ) diff --git a/benchmark/scripts/benchmark_simpo_loss.py b/benchmark/scripts/benchmark_simpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..148b8e3e4db0d68cc524f443e3076d7f1afd596c --- /dev/null +++ b/benchmark/scripts/benchmark_simpo_loss.py @@ -0,0 +1,167 @@ +import os +import sys + +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.utils import infer_device + +device = infer_device() + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + + +############################################################################# +# Test the memory consumption of the linear fused cross entropy loss +############################################################################# + + +def bench_memory_fused_linear_simpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_simpo_loss import LigerLMHeadSimPO + from test.chunked_loss.test_simpo_loss import TorchLMHeadCPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + + # Instantiate once and retrieve the first output only + torch_lm_head_simpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + liger_lm_head_simpo = LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device) + torch_fwd = lambda x, target: torch_lm_head_simpo(x, target)[0] + liger_fwd = lambda x, target: liger_lm_head_simpo(x, target)[0] + + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + def fwd(): + if provider == "liger": + return liger_fwd(_input, target) + elif provider == "huggingface": + return torch_fwd(_input, target) + + def full(): + y = fwd() + y.backward() + + mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES) + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +# ############################################################################# +# # Test the speed of 
the fused linear SimPO loss +# ############################################################################# + + +def bench_speed_fused_linear_simpo_loss( + input: SingleBenchmarkRunInput, +) -> SingleBenchmarkRunOutput: + from test.chunked_loss.test_simpo_loss import LigerLMHeadSimPO + from test.chunked_loss.test_simpo_loss import TorchLMHeadCPO + + B = input.x + T = input.extra_benchmark_config["T"] + H = input.extra_benchmark_config["H"] + V = input.extra_benchmark_config["V"] + dtype = input.extra_benchmark_config["dtype"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + # Instantiate once and retrieve the first output only + torch_lm_head_simpo = TorchLMHeadCPO(H=H, V=V, dtype=dtype).to(device) + liger_lm_head_simpo = LigerLMHeadSimPO(H=H, V=V, dtype=dtype).to(device) + torch_fwd = lambda x, target: torch_lm_head_simpo(x, target)[0] + liger_fwd = lambda x, target: liger_lm_head_simpo(x, target)[0] + + _input = torch.randn(B, T, H, requires_grad=True, dtype=dtype, device=device) + target = torch.randint(V, (B, T), dtype=torch.long, device=device) + + def fwd(): + if provider == "liger": + return liger_fwd(_input, target) + elif provider == "huggingface": + return torch_fwd(_input, target) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + rep=100, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + grad_to_none=[_input], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + rep=100, + quantiles=QUANTILES, + ) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "fused_linear_simpo_loss", + "x_name": "B", + "x_label": "B", + "x_values": [2**i for i in range(1, 5)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "T": 1024, + "H": 4096, + "V": 128256, + "mode": "forward", + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_fused_linear_simpo_loss, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_fused_linear_simpo_loss, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_softmax.py b/benchmark/scripts/benchmark_softmax.py new file mode 100755 index 0000000000000000000000000000000000000000..10e994c8c12ab90b0accfcbee81eee9961d84ab2 --- /dev/null +++ b/benchmark/scripts/benchmark_softmax.py @@ -0,0 +1,140 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.softmax import LigerSoftmax +from liger_kernel.utils import infer_device + +device = infer_device() + + +def bench_speed_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + extra_benchmark_config = input.extra_benchmark_config + M = extra_benchmark_config["M"] + 
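# input.x sweeps the row width N (the dim the softmax normalizes over), + # while the row count M stays fixed at the value in extra_benchmark_config, + # so the (M, N) problem grows only along the normalized dimension. +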
dtype = extra_benchmark_config["dtype"] + + x_shape = (M, N) + liger_softmax = LigerSoftmax().to(device).to(dtype) + torch_softmax = torch.nn.Softmax(dim=-1).to(device).to(dtype) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def y_fwd(): + if provider == "liger": + return liger_softmax(x) + if provider == "torch": + return torch_softmax(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(y_fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500) + elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[x], + rep=500, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=[x], rep=500) + + if any(val is None for val in (ms_20, ms_50, ms_80)): + raise RuntimeError(f"Benchmark speed result is None: ms_20={ms_20}, ms_50={ms_50}, ms_80={ms_80}") + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + N = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + extra_benchmark_config = input.extra_benchmark_config + # Use the same (M, N) problem shape as the speed benchmark; input.x only + # sweeps the softmax dim N, so the row count M comes from the extra config. + M = extra_benchmark_config["M"] + dtype = extra_benchmark_config.get("dtype", torch.float32) + + torch_softmax = torch.nn.Softmax(dim=-1) + liger_softmax = LigerSoftmax().to(device).to(dtype) + + x = torch.randn((M, N), device=device, dtype=dtype, requires_grad=True) + + def fwd(): + if provider == "liger": + return liger_softmax(x) + elif provider == "torch": + return torch_softmax(x) + else: + raise ValueError(f"Invalid provider: {provider} for softmax") + + def full(): + y = fwd() + y.backward(torch.ones_like(y), retain_graph=True) + + if mode == "forward": + mem_50, mem_20, mem_80 = _test_memory(fwd, quantiles=QUANTILES) + elif mode == "backward": + do = torch.ones_like(x) + y = fwd() + mem_50, mem_20, mem_80 = _test_memory(lambda: y.backward(do, retain_graph=True), quantiles=QUANTILES) + else: + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + if any(val is None for val in (mem_20, mem_50, mem_80)): + raise RuntimeError(f"Benchmark memory result is None: mem_20={mem_20}, mem_50={mem_50}, mem_80={mem_80}") + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = dict( + kernel_name="softmax", + x_name="N", + x_label="hidden size", + x_values=[128, 256, 512, 1024, 2048, 4096], + kernel_providers=["liger", "torch"], + extra_benchmark_configs=[ + {"M": 2048, "dtype": torch.float32}, + {"M": 2048, "dtype": torch.bfloat16}, + ], + ) + + run_benchmarks( + bench_test_fn=bench_speed_softmax, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + overwrite=args.overwrite, + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_softmax, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + overwrite=args.overwrite, + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_sparse_multi_token_attention.py b/benchmark/scripts/benchmark_sparse_multi_token_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..98f47d713920e6842c3be9761672296066d9d644 --- /dev/null +++ 
b/benchmark/scripts/benchmark_sparse_multi_token_attention.py @@ -0,0 +1,254 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.multi_token_attention import LigerMultiTokenAttention +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchSparseMultiTokenAttention(torch.nn.Module): + def __init__(self, C_in, C_out, K, groups, bias, dtype, device): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(C_out, C_in // groups, K, K, dtype=dtype, device=device)) + self.bias = torch.nn.Parameter(torch.empty(C_out, dtype=dtype, device=device)) if bias else None + self.K = K + self.groups = groups + self.dtype = dtype + self.compute_dtype = torch.float32 + + def forward(self, scores): + B, C_in, L, _ = scores.shape + mask = torch.tril(torch.ones(L, L, dtype=torch.bool, device=scores.device)).view(1, 1, L, L) + inf = torch.tensor(-1e9, device=scores.device, dtype=self.compute_dtype) + zero = torch.tensor(0.0, device=scores.device, dtype=self.compute_dtype) + + s_compute = scores.to(self.compute_dtype) + s_inf = s_compute.masked_fill(~mask, inf) + + dim = -1 + z = s_inf + + z_sorted, _ = torch.sort(z, dim=dim, descending=True) + + cum_sum = torch.cumsum(z_sorted, dim=dim) + + k_indices = torch.arange(1, L + 1, device=z.device, dtype=z.dtype).view(1, 1, 1, L) + + is_positive = z_sorted > -1e8 + condition = (1 + k_indices * z_sorted > cum_sum) & is_positive + k_sparsemax = torch.sum(condition, dim=dim, keepdim=True) + + k_sparsemax_safe = torch.max(k_sparsemax, torch.ones_like(k_sparsemax)) + + cum_sum_k = torch.gather(cum_sum, dim=dim, index=k_sparsemax_safe.long() - 1) + + tau = (cum_sum_k - 1) / k_sparsemax_safe.to(z.dtype) + tau = torch.where(k_sparsemax == 0, torch.full_like(tau, float("inf")), tau) + + probs = torch.clamp(z - tau, min=0) + + weight_compute = self.weight.to(self.compute_dtype) + bias_compute = self.bias.to(self.compute_dtype) if self.bias is not None else None + + out_c = torch.nn.functional.conv2d( + probs, weight_compute, bias_compute, stride=1, padding=self.K // 2, groups=self.groups + ) + return out_c.masked_fill(~mask, zero).to(scores.dtype) + + +def bench_speed_sparse_multi_token_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + L = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + C_in = extra_benchmark_config["C_in"] + C_out = extra_benchmark_config["C_out"] + K = extra_benchmark_config["K"] + groups = extra_benchmark_config["groups"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B, C_in, L, L) + + liger_attn = ( + LigerMultiTokenAttention( + in_channels=C_in, + out_channels=C_out, + kernel_size=K, + stride=1, + padding=K // 2, + dilation=1, + groups=groups, + bias=bias, + sparse=True, + ) + .to(device) + .to(dtype) + ) + + torch_attn = TorchSparseMultiTokenAttention( + C_in=C_in, C_out=C_out, K=K, groups=groups, bias=bias, dtype=dtype, device=device + ) + + with torch.no_grad(): + torch.nn.init.kaiming_uniform_(liger_attn.weight, a=5**0.5) + if bias: + torch.nn.init.zeros_(liger_attn.bias) + torch_attn.weight.copy_(liger_attn.weight) + if bias: + torch_attn.bias.copy_(liger_attn.bias) + + x 
= torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return liger_attn(x) + elif provider == "torch": + return torch_attn(x) + + print(f"Starting Warmup for input size: {x_shape}") + _ = fwd() + if mode in ("backward", "full"): + y = _ + y.backward(dy, retain_graph=True) + print("Done Warmup") + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, grad_to_none=[x], rep=100, quantiles=QUANTILES) + elif mode == "backward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=100, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, grad_to_none=[x], rep=100, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_sparse_multi_token_attention(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + L = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + C_in = extra_benchmark_config["C_in"] + C_out = extra_benchmark_config["C_out"] + K = extra_benchmark_config["K"] + groups = extra_benchmark_config["groups"] + bias = extra_benchmark_config["bias"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B, C_in, L, L) + + liger_attn = ( + LigerMultiTokenAttention( + in_channels=C_in, + out_channels=C_out, + kernel_size=K, + stride=1, + padding=K // 2, + dilation=1, + groups=groups, + bias=bias, + sparse=True, + ) + .to(device) + .to(dtype) + ) + + torch_attn = TorchSparseMultiTokenAttention( + C_in=C_in, C_out=C_out, K=K, groups=groups, bias=bias, dtype=dtype, device=device + ) + + with torch.no_grad(): + torch.nn.init.kaiming_uniform_(liger_attn.weight, a=5**0.5) + if bias: + torch.nn.init.zeros_(liger_attn.bias) + torch_attn.weight.copy_(liger_attn.weight) + if bias: + torch_attn.bias.copy_(liger_attn.bias) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + def fwd(): + if provider == "liger": + return liger_attn(x) + elif provider == "torch": + return torch_attn(x) + + def full(): + y = fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "sparse_multi_token_attention", + "x_name": "L", + "x_label": "sequence length", + "x_values": [2**i for i in range(5, 10)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [ + { + "B": 2, + "C_in": 4, + "C_out": 4, + "K": 3, + "groups": 1, + "bias": True, + "dtype": torch.float32, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_sparse_multi_token_attention, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_sparse_multi_token_attention, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_sparsemax.py b/benchmark/scripts/benchmark_sparsemax.py new file mode 100755 index 
0000000000000000000000000000000000000000..919f4c66defbe1d2388aa17c912fec09058316d4 --- /dev/null +++ b/benchmark/scripts/benchmark_sparsemax.py @@ -0,0 +1,172 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.sparsemax import LigerSparsemax +from liger_kernel.utils import infer_device + +device = infer_device() + + +def torch_sparsemax(input_tensor: torch.Tensor, dim: int = -1) -> torch.Tensor: + input_dims = input_tensor.dim() + if dim < 0: + dim = input_dims + dim + input_sorted, _ = torch.sort(input_tensor, dim=dim, descending=True) + cumsum_input = torch.cumsum(input_sorted, dim=dim) + input_size = input_tensor.size(dim) + range_tensor = torch.arange(1, input_size + 1, device=input_tensor.device, dtype=input_tensor.dtype) + shape = [1] * input_dims + shape[dim] = input_size + range_tensor = range_tensor.view(shape) + k_bound = 1 + range_tensor * input_sorted + support = k_bound > cumsum_input + k = support.sum(dim=dim, keepdim=True).clamp(min=1) + support_sum = (input_sorted * support).sum(dim=dim, keepdim=True) + tau = (support_sum - 1) / k + return torch.clamp(input_tensor - tau, min=0) + + +class TorchSparsemax(torch.nn.Module): + def __init__(self, dim: int = -1): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch_sparsemax(x, dim=self.dim) + + +def bench_speed_sparsemax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + V = input.x + provider = input.kernel_provider + mode = input.kernel_operation_mode + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + T = extra_benchmark_config["T"] + dim = extra_benchmark_config["dim"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B * T, V) + + torch_sparsemax_module = TorchSparsemax(dim=dim).to(device) + liger_sparsemax_module = LigerSparsemax(dim=dim).to(device) + + x = torch.randn(x_shape, dtype=dtype, device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + def y_fwd(): + if provider == "liger": + return liger_sparsemax_module(x) + elif provider == "torch": + return torch_sparsemax_module(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + y_fwd, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = y_fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(dy, retain_graph=True), + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[x], + rep=500, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_sparsemax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + V = input.x + provider = input.kernel_provider + + extra_benchmark_config = input.extra_benchmark_config + B = extra_benchmark_config["B"] + T = extra_benchmark_config["T"] + dim = extra_benchmark_config["dim"] + dtype = extra_benchmark_config["dtype"] + + x_shape = (B * T, V) + + torch_sparsemax_module = TorchSparsemax(dim=dim).to(device) + liger_sparsemax_module = LigerSparsemax(dim=dim).to(device) + + x = torch.randn(x_shape, dtype=dtype, 
device=device) + dy = torch.randn_like(x) + x.requires_grad_(True) + + # utility functions + def y_fwd(): + if provider == "liger": + return liger_sparsemax_module(x) + elif provider == "torch": + return torch_sparsemax_module(x) + + def full(): + y = y_fwd() + y.backward(dy, retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + common_configs = { + "kernel_name": "sparsemax", + "x_name": "V", + "x_label": "feature size", + "x_values": [2**i for i in range(10, 16)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [{"B": 4, "T": 512, "dim": -1, "dtype": torch.float32}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_sparsemax, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_sparsemax, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_swiglu.py b/benchmark/scripts/benchmark_swiglu.py new file mode 100755 index 0000000000000000000000000000000000000000..8d46572fdb1ccbd0e14daa4be1e4da3037575ca8 --- /dev/null +++ b/benchmark/scripts/benchmark_swiglu.py @@ -0,0 +1,115 @@ +import math + +import torch + +from benchmark_model_configs import compute_seq_len_sweep_config +from benchmark_model_configs import estimate_kernel_peak_memory +from benchmark_model_configs import get_benchmark_model_config +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaMLP +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import parse_benchmark_script_args +from utils import run_benchmarks +from utils import run_memory_benchmark +from utils import run_speed_benchmark + +from liger_kernel.transformers.swiglu import LigerSwiGLUMLP +from liger_kernel.utils import infer_device + +device = infer_device() + + +def _setup_swiglu(input: SingleBenchmarkRunInput): + """Create input tensor and SwiGLU layer from benchmark config.""" + cfg = input.extra_benchmark_config + llama_config = LlamaConfig( + hidden_size=cfg["hidden_size"], + intermediate_size=cfg["intermediate_size"], + hidden_act=cfg["hidden_act"], + ) + x = torch.randn( + cfg["bsz"], + input.x, + cfg["hidden_size"], + device=device, + dtype=cfg["dtype"], + requires_grad=True, + ) + if input.kernel_provider == "liger": + layer = LigerSwiGLUMLP(config=llama_config).to(device).to(cfg["dtype"]) + elif input.kernel_provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(cfg["dtype"]) + else: + raise ValueError(f"Invalid provider: {input.kernel_provider} for SwiGLU") + return x, layer + + +def bench_speed_swiglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = _setup_swiglu(input) + return run_speed_benchmark(lambda: layer(x), input.kernel_operation_mode, [x]) + + +def bench_memory_swiglu(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + x, layer = _setup_swiglu(input) + return run_memory_benchmark(lambda: layer(x), input.kernel_operation_mode) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + model = get_benchmark_model_config(args.model) + probe_seq_len = 1024 + + def _probe(): + probe_input = 
SingleBenchmarkRunInput( + x=probe_seq_len, + kernel_provider="huggingface", + extra_benchmark_config={ + "bsz": 1, + "hidden_size": model.hidden_size, + "intermediate_size": model.intermediate_size, + "hidden_act": "silu", + "dtype": model.dtype, + }, + ) + x, layer = _setup_swiglu(probe_input) + return layer(x) + + peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe) + kernel_bpt = peak_bytes // probe_seq_len + + config = compute_seq_len_sweep_config(model, kernel_bytes_per_token=kernel_bpt) + + common_configs = { + "kernel_name": "swiglu", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, int(math.log2(config.seq_len)) + 1)], + "kernel_providers": ["liger", "huggingface"], + "extra_benchmark_configs": [ + { + "bsz": config.batch_size, + "hidden_size": model.hidden_size, + "intermediate_size": model.intermediate_size, + "hidden_act": "silu", + "dtype": model.dtype, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_swiglu, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs, + ) + run_benchmarks( + bench_test_fn=bench_memory_swiglu, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs, + ) diff --git a/benchmark/scripts/benchmark_tiled_mlp.py b/benchmark/scripts/benchmark_tiled_mlp.py new file mode 100755 index 0000000000000000000000000000000000000000..1eaf21dac9f5c14aa46332ab832109ca37809957 --- /dev/null +++ b/benchmark/scripts/benchmark_tiled_mlp.py @@ -0,0 +1,397 @@ +import math + +import torch +import torch.nn as nn +import triton + +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaMLP +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.geglu import LigerGEGLUMLP +from liger_kernel.transformers.swiglu import LigerSwiGLUMLP +from liger_kernel.transformers.tiled_mlp import LigerTiledGEGLUMLP +from liger_kernel.transformers.tiled_mlp import LigerTiledSwiGLUMLP +from liger_kernel.utils import infer_device + +device = infer_device() + + +# DeepSpeed TiledMLP implementation +# Based on: https://github.com/deepspeedai/DeepSpeed/blob/v0.18.2/deepspeed/runtime/sequence_parallel/ulysses_sp.py#L838 +class DeepSpeedTiledMLP(torch.autograd.Function): + """ + DeepSpeed's TiledMLP implementation for fair comparison. + This is the actual DeepSpeed algorithm that performs tiled MLP computation + to massively reduce memory usage with very long sequence lengths. + + This module re-computes forward in the backward, so forward occurs twice per iteration. 
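+ + A sketch of the flow (as implemented below): + forward: chunk x into `shards` slices along the sequence dim, run fn on + each slice under no_grad, and cat the outputs back together. + backward: re-run fn per slice with grad enabled and backprop the matching + slice of the incoming gradient into a preallocated x_grad buffer.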
+ """ + + @staticmethod + def forward(ctx, fn, self, x, shards, compute_params) -> torch.Tensor: + ctx.fn = fn + ctx.self = self + ctx.shards = shards + ctx.compute_params = [p for p in compute_params if p.requires_grad] if compute_params else [] + ctx.save_for_backward(x) + + # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts) + x_shards = list(torch.chunk(x, chunks=shards, dim=-2)) + with torch.no_grad(): + output_shards = [fn(self, x_shard) for x_shard in x_shards] + output_unsharded = torch.cat(output_shards, dim=-2) + + return output_unsharded + + @staticmethod + def backward(ctx, *grads): + fn = ctx.fn + (x,) = ctx.saved_tensors + self = ctx.self + shards = ctx.shards + compute_params = ctx.compute_params + + x_requires_grad = x.requires_grad + x = x.detach() + # detach() unsets x.requires_grad, so restore it + x.requires_grad_(x_requires_grad) + + # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts) + hidden_size = x.shape[-1] + x_shape_orig = x.shape + + # flatten bs+seqlen to avoid having stride issues when narrowing into seqlen w/ bs>1 + x = x.view(-1, hidden_size) + incoming_grad = grads[0].view(-1, hidden_size) + x_grad = torch.zeros_like(x) + + x_shards = list(torch.chunk(x, chunks=shards, dim=0)) + + for i, x_shard in enumerate(x_shards): + # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run + # XXX: DDP, FSDP will need something similar to make it work + if compute_params: + if i + 1 < shards: + for param in compute_params: + if hasattr(param, "ds_grad_is_ready"): + param.ds_grad_is_ready = False + else: + # last shard, can add the grad + for param in compute_params: + if hasattr(param, "ds_grad_is_ready"): + param.ds_grad_is_ready = True + + x_shard.requires_grad_(x_requires_grad) + + # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step + shard_step = x_shards[i].shape[0] + shard_offset = i * x_shards[0].shape[0] + + x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard) + incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step).view_as(x_shard) + with torch.enable_grad(): + output = fn(self, x_shard) + torch.autograd.backward(output, incoming_grad_shard) + + # unflatten + x_grad = x_grad.view(x_shape_orig) + + return (None, None, x_grad, None, None) + + +# DeepSpeed TiledMLP wrapper to match our interface +class DeepSpeedTiledMLPWrapper(nn.Module): + """ + Wrapper for DeepSpeed's TiledMLP to match the interface used in benchmarks. + Uses the DeepSpeed TiledMLP algorithm for memory-efficient MLP computation. 
+ """ + + def __init__(self, config, num_shards=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.num_shards = num_shards + + self.mlp = LlamaMLP(config=config) + + def forward(self, x): + # Calculate num_shards if not provided + num_shards = self.num_shards + if num_shards is None: + hidden_size = x.shape[-1] + seqlen = x.shape[-2] + num_shards = math.ceil(seqlen / hidden_size) + num_shards = max(1, num_shards) + + # Collect compute parameters for DeepSpeed ZeRO compatibility + compute_params = [ + self.mlp.down_proj.weight, + self.mlp.gate_proj.weight, + self.mlp.up_proj.weight, + ] + + # Define the MLP forward function for DeepSpeed TiledMLP + def mlp_forward(mlp_module, x_input): + return mlp_module.down_proj(mlp_module.act_fn(mlp_module.gate_proj(x_input)) * mlp_module.up_proj(x_input)) + + # Use DeepSpeed's TiledMLP implementation + return DeepSpeedTiledMLP.apply( + mlp_forward, + self.mlp, + x, + num_shards, + compute_params, + ) + + +def bench_speed_tiled_mlp(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + seq_len = input.x + bsz = input.extra_benchmark_config["bsz"] + hidden_size = input.extra_benchmark_config["hidden_size"] + intermediate_size = input.extra_benchmark_config["intermediate_size"] + hidden_act = input.extra_benchmark_config["hidden_act"] + dtype = input.extra_benchmark_config["dtype"] + num_shards = input.extra_benchmark_config.get("num_shards", None) + activation_type = input.extra_benchmark_config["activation_type"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + llama_config = LlamaConfig( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + ) + + x_shape = (bsz, seq_len, hidden_size) + + # initialize input + x = torch.randn(*x_shape, device=device, dtype=dtype, requires_grad=True) + + if activation_type == "geglu": + if provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger": + layer = LigerGEGLUMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger_tiled": + layer = LigerTiledGEGLUMLP(config=llama_config, num_shards=num_shards).to(device).to(dtype) + elif provider == "deepspeed_tiled": + layer = DeepSpeedTiledMLPWrapper(config=llama_config, num_shards=num_shards).to(device).to(dtype) + else: + raise ValueError(f"Invalid provider: {provider} for GEGLU") + elif activation_type == "swiglu": + if provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger": + layer = LigerSwiGLUMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger_tiled": + layer = LigerTiledSwiGLUMLP(config=llama_config, num_shards=num_shards).to(device).to(dtype) + elif provider == "deepspeed_tiled": + layer = DeepSpeedTiledMLPWrapper(config=llama_config, num_shards=num_shards).to(device).to(dtype) + else: + raise ValueError(f"Invalid provider: {provider} for SwiGLU") + else: + raise ValueError(f"Invalid activation_type: {activation_type}") + + def fwd(): + return layer(x) + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd, + grad_to_none=[x], + rep=10, + quantiles=QUANTILES, + ) + elif mode == "backward": + do = torch.randn_like(x) + y = fwd() + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(do, retain_graph=True), + grad_to_none=[x], + rep=10, + quantiles=QUANTILES, + ) + else: + + def full(): + y = fwd() + 
y.backward(torch.randn_like(y), retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=[x], + rep=10, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_tiled_mlp(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + seq_len = input.x + bsz = input.extra_benchmark_config["bsz"] + hidden_size = input.extra_benchmark_config["hidden_size"] + intermediate_size = input.extra_benchmark_config["intermediate_size"] + hidden_act = input.extra_benchmark_config["hidden_act"] + dtype = input.extra_benchmark_config["dtype"] + num_shards = input.extra_benchmark_config.get("num_shards", None) + activation_type = input.extra_benchmark_config["activation_type"] + provider = input.kernel_provider + mode = input.kernel_operation_mode + + llama_config = LlamaConfig( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + ) + + x_shape = (bsz, seq_len, hidden_size) + # initialize input + x = torch.randn(*x_shape, device=device, dtype=dtype, requires_grad=True) + + if activation_type == "geglu": + if provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger": + layer = LigerGEGLUMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger_tiled": + layer = LigerTiledGEGLUMLP(config=llama_config, num_shards=num_shards).to(device).to(dtype) + elif provider == "deepspeed_tiled": + layer = DeepSpeedTiledMLPWrapper(config=llama_config, num_shards=num_shards).to(device).to(dtype) + else: + raise ValueError(f"Invalid provider: {provider} for GEGLU") + elif activation_type == "swiglu": + if provider == "huggingface": + layer = LlamaMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger": + layer = LigerSwiGLUMLP(config=llama_config).to(device).to(dtype) + elif provider == "liger_tiled": + layer = LigerTiledSwiGLUMLP(config=llama_config, num_shards=num_shards).to(device).to(dtype) + elif provider == "deepspeed_tiled": + layer = DeepSpeedTiledMLPWrapper(config=llama_config, num_shards=num_shards).to(device).to(dtype) + else: + raise ValueError(f"Invalid provider: {provider} for SwiGLU") + else: + raise ValueError(f"Invalid activation_type: {activation_type}") + + def fwd(): + return layer(x) + + def full(): + y = fwd() + y.backward(torch.randn_like(y), retain_graph=True) + + if mode == "forward": + mem_50, mem_20, mem_80 = _test_memory( + fwd, + quantiles=QUANTILES, + ) + elif mode == "backward": + do = torch.randn_like(x) + y = fwd() + mem_50, mem_20, mem_80 = _test_memory( + lambda: y.backward(do, retain_graph=True), + quantiles=QUANTILES, + ) + else: + mem_50, mem_20, mem_80 = _test_memory( + full, + quantiles=QUANTILES, + ) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + + # Benchmark GEGLU variants + kernel_providers_geglu = ["huggingface", "liger", "liger_tiled", "deepspeed_tiled"] + + common_configs_geglu = { + "kernel_name": "tiled_geglu", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, 15)], # 1024 to 16384 + "kernel_providers": kernel_providers_geglu, + "extra_benchmark_configs": [ + { + "bsz": 2, + "hidden_size": 2048, + "intermediate_size": 4096, + "hidden_act": "gelu_pytorch_tanh", + "activation_type": "geglu", + "num_shards": 4, + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + 
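# Back-of-envelope sizing (a rough estimate, not measured): at the largest + # sweep point (bsz=2, T=2**14, intermediate_size=4096, bf16) the plain MLP's + # gate_proj/up_proj activations alone occupy about + # 2 * 2 * 16384 * 4096 * 2 bytes ~= 512 MB, which is the headroom the tiled + # providers (num_shards=4) trade for recomputing the forward in backward. +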
run_benchmarks( + bench_test_fn=bench_speed_tiled_mlp, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs_geglu, + ) + run_benchmarks( + bench_test_fn=bench_memory_tiled_mlp, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs_geglu, + ) + + # Benchmark SwiGLU variants + kernel_providers_swiglu = ["huggingface", "liger", "liger_tiled", "deepspeed_tiled"] + + common_configs_swiglu = { + "kernel_name": "tiled_swiglu", + "x_name": "T", + "x_label": "sequence length", + "x_values": [2**i for i in range(10, 15)], # 1024 to 16384 + "kernel_providers": kernel_providers_swiglu, + "extra_benchmark_configs": [ + { + "bsz": 2, + "hidden_size": 2048, + "intermediate_size": 4096, + "hidden_act": "silu", + "activation_type": "swiglu", + "num_shards": 4, + "dtype": torch.bfloat16, + } + ], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_speed_tiled_mlp, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="speed", + metric_unit="ms", + **common_configs_swiglu, + ) + run_benchmarks( + bench_test_fn=bench_memory_tiled_mlp, + kernel_operation_modes=["full", "forward", "backward"], + metric_name="memory", + metric_unit="MB", + **common_configs_swiglu, + ) diff --git a/benchmark/scripts/benchmark_tvd.py b/benchmark/scripts/benchmark_tvd.py new file mode 100755 index 0000000000000000000000000000000000000000..ef76380a2664a658a2e19c1815fa03cd8a47a5d9 --- /dev/null +++ b/benchmark/scripts/benchmark_tvd.py @@ -0,0 +1,145 @@ +import torch +import triton + +from utils import QUANTILES +from utils import SingleBenchmarkRunInput +from utils import SingleBenchmarkRunOutput +from utils import _test_memory +from utils import parse_benchmark_script_args +from utils import run_benchmarks + +from liger_kernel.transformers.tvd import LigerTVDLoss +from liger_kernel.utils import get_total_gpu_memory +from liger_kernel.utils import infer_device + +device = infer_device() + + +class TorchTVDLoss(torch.nn.Module): + def __init__(self, reduction="batchmean"): + super(TorchTVDLoss, self).__init__() + self.reduction = reduction + + def forward(self, p, q): + tvd = torch.abs(p - q) / 2.0 + if self.reduction == "mean": + return torch.sum(tvd) / (p.size(0) * p.size(1)) + elif self.reduction == "sum": + return torch.sum(tvd) + elif self.reduction == "none": + return tvd + elif self.reduction == "batchmean": + return torch.sum(tvd) / p.size(0) + else: + raise ValueError("Invalid reduction type.") + + +S, E = 12, 18 + + +def bench_speed_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + reduction = "batchmean" + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + torch_tvd = TorchTVDLoss(reduction=reduction) + liger_tvd = LigerTVDLoss(reduction=reduction) + + _input = torch.randn(B * T, V, requires_grad=True, device=device).softmax(dim=-1) + target = torch.randn(B * T, V, device=device).softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_tvd(_input, target) + else: + return torch_tvd(_input, target) + + if input.kernel_operation_mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100) + elif input.kernel_operation_mode == "backward": + y = fwd() + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(retain_graph=True), + quantiles=QUANTILES, + grad_to_none=[_input], + rep=100, + ) + elif 
input.kernel_operation_mode == "full": + + def full(): + y = fwd() + y.backward(retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100) + return SingleBenchmarkRunOutput( + y_20=ms_20, + y_50=ms_50, + y_80=ms_80, + ) + + +def bench_memory_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput: + reduction = "batchmean" + torch_tvd = TorchTVDLoss(reduction=reduction) + liger_tvd = LigerTVDLoss(reduction=reduction) + + V = input.x + B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"] + + _input = torch.randn(B * T, V, requires_grad=True, device=device).softmax(dim=-1) + target = torch.randn(B * T, V, device=device).softmax(dim=-1) + + def fwd(): + if input.kernel_provider == "liger": + return liger_tvd(_input, target) + else: + return torch_tvd(_input, target) + + def full(): + y = fwd() + y.backward(retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + + return SingleBenchmarkRunOutput( + y_20=mem_20, + y_50=mem_50, + y_80=mem_80, + ) + + +if __name__ == "__main__": + args = parse_benchmark_script_args() + gpu_memory_gbs = get_total_gpu_memory() + # We know that the full test will require 66 GB for vocab size 2^17 + if gpu_memory_gbs >= 66: + x_max = 17 + elif gpu_memory_gbs >= 32: + x_max = 16 + else: + x_max = 15 + common_args = { + "kernel_name": "tvd", + "x_name": "V", + "x_label": "vocab size", + "x_values": [2**i for i in range(12, x_max + 1)], + "kernel_providers": ["liger", "torch"], + "extra_benchmark_configs": [{"B": 8, "T": 2048}], + "overwrite": args.overwrite, + } + + run_benchmarks( + bench_test_fn=bench_memory_tvd, + kernel_operation_modes=["full"], + metric_name="memory", + metric_unit="MB", + **common_args, + ) + + run_benchmarks( + bench_test_fn=bench_speed_tvd, + kernel_operation_modes=["forward", "full", "backward"], + metric_name="speed", + metric_unit="ms", + **common_args, + ) diff --git a/benchmark/scripts/utils.py b/benchmark/scripts/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..e6b4fc9e85d03673729fb0edf8d66e315a2d0b16 --- /dev/null +++ b/benchmark/scripts/utils.py @@ -0,0 +1,439 @@ +import argparse +import csv +import json +import os +import time + +from collections import OrderedDict +from dataclasses import asdict +from dataclasses import dataclass +from importlib.metadata import version +from itertools import zip_longest +from typing import Any +from typing import Callable +from typing import Dict +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.utils import infer_device + +device = infer_device() + +LIGER_KERNEL_VERSION = version("liger-kernel") + +QUANTILES = [0.5, 0.2, 0.8] + + +@dataclass +class SingleBenchmarkRunInput: + x: Union[int, float] + kernel_provider: str + kernel_operation_mode: Optional[str] = "" + extra_benchmark_config: Optional[Dict[str, Any]] = None + + +@dataclass +class SingleBenchmarkRunOutput: + # 20th percentile + y_20: float + # 50th percentile (median) + y_50: float + # 80th percentile + y_80: float + + +@dataclass +class BenchmarkData: + """ + BenchmarkData is a dataclass to store the benchmark data for a completed benchmark + run on all x-values for a given kernel/kernel operation mode/metric/extra_benchmark_config + """ + + kernel_name: str + kernel_provider: str + metric_name: str + metric_unit: str + gpu_name: str + x_name: str + x_label: str + x_values: List[float] + y_values_50: List[float] + y_values_20: 
List[float] + y_values_80: List[float] + timestamp: str + kernel_operation_mode: Optional[str] = None + extra_benchmark_config_str: Optional[str] = None + liger_version: str = LIGER_KERNEL_VERSION + + +@dataclass +class BenchmarkDataCSVRow: + # The ordering of field names here will be the order of columns in the CSV + kernel_name: str + kernel_provider: str + kernel_operation_mode: Union[str, None] + metric_name: str + metric_unit: str + x_name: str + x_label: str + x_value: float + y_value_50: float + y_value_20: float + y_value_80: float + extra_benchmark_config_str: Union[str, None] + gpu_name: str + timestamp: str + liger_version: str + + +def _test_memory( + func: Callable, + _iter: int = 10, + quantiles: Optional[List[float]] = None, + return_mode="mean", +) -> float: + assert return_mode in ["min", "max", "mean", "median"] + total_mem = [] + + for _ in range(_iter): + getattr(torch, device).memory.reset_peak_memory_stats() + func() + # Convert to MB + mem = getattr(torch, device).max_memory_allocated() / 2**20 + total_mem.append(mem) + + total_mem = torch.tensor(total_mem, dtype=torch.float) + if quantiles is not None: + quantiles_data = torch.quantile(total_mem, torch.tensor(quantiles, dtype=torch.float)).tolist() + if len(quantiles_data) == 1: + quantiles_data = quantiles_data[0] + return quantiles_data + return getattr(torch, return_mode)(total_mem).item() + + +def run_speed_benchmark( + fwd_fn: Callable, + mode: str, + input_tensors: List[torch.Tensor], + rep: int = 10, +) -> "SingleBenchmarkRunOutput": + """Measure execution speed for forward, backward, or full (fwd+bwd). + + Covers the common case where the forward function returns a single tensor + and backward uses a random gradient of the same shape. For kernels with + scalar output (losses) or multiple outputs (e.g. RoPE), write custom + measurement logic instead. + """ + import triton + + if mode == "forward": + ms_50, ms_20, ms_80 = triton.testing.do_bench( + fwd_fn, + grad_to_none=input_tensors, + rep=rep, + quantiles=QUANTILES, + ) + elif mode == "backward": + y = fwd_fn() + do = torch.randn_like(y) + ms_50, ms_20, ms_80 = triton.testing.do_bench( + lambda: y.backward(do, retain_graph=True), + grad_to_none=input_tensors, + rep=rep, + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd_fn() + y.backward(torch.randn_like(y), retain_graph=True) + + ms_50, ms_20, ms_80 = triton.testing.do_bench( + full, + grad_to_none=input_tensors, + rep=rep, + quantiles=QUANTILES, + ) + else: + raise ValueError(f"Unsupported mode: {mode}. Use 'forward', 'backward', or 'full'.") + return SingleBenchmarkRunOutput(y_20=ms_20, y_50=ms_50, y_80=ms_80) + + +def run_memory_benchmark( + fwd_fn: Callable, + mode: str, +) -> "SingleBenchmarkRunOutput": + """Measure peak memory for forward, backward, or full (fwd+bwd). + + Same caveats as :func:`run_speed_benchmark` regarding output shape. + """ + if mode == "forward": + mem_50, mem_20, mem_80 = _test_memory(fwd_fn, quantiles=QUANTILES) + elif mode == "backward": + y = fwd_fn() + do = torch.randn_like(y) + mem_50, mem_20, mem_80 = _test_memory( + lambda: y.backward(do, retain_graph=True), + quantiles=QUANTILES, + ) + elif mode == "full": + + def full(): + y = fwd_fn() + y.backward(torch.randn_like(y), retain_graph=True) + + mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES) + else: + raise ValueError(f"Unsupported mode: {mode}. 
Use 'forward', 'backward', or 'full'.") + return SingleBenchmarkRunOutput(y_20=mem_20, y_50=mem_50, y_80=mem_80) + + +def get_current_file_directory() -> str: + """ + Returns the directory path of the current Python file. + """ + # Get the absolute path of the current file + current_file_path = os.path.abspath(__file__) + + # Get the directory path of the current file + return os.path.dirname(current_file_path) + + +def sleep(seconds): + def decorator(function): + def wrapper(*args, **kwargs): + time.sleep(seconds) + return function(*args, **kwargs) + + return wrapper + + return decorator + + +def _print_benchmarking_banner(metric_name: str, kernel_name: str): + print("**************************************") + print(f" BENCHMARKING {metric_name.upper()} for {kernel_name.upper()}") + print("**************************************") + + +def get_formatted_time(): + return time.strftime("%Y-%m-%d %H:%M:%S") + + +def get_gpu_name(): + """ + Returns the current GPU name, formatted to serve as a directory name + """ + torch_device = getattr(torch, device) + if torch_device.is_available(): + gpu_name = torch_device.get_device_name(torch_device.current_device()) + return gpu_name + else: + raise Exception("Benchmarks can only be run on GPU.") + + +def update_benchmark_data_csv( + benchmark_data_list: List[BenchmarkData], + filename: str = "all_benchmark_data.csv", + overwrite: bool = True, +): + """ + Update the CSV file with the new benchmark data. If the file does not exist, create it. + If an entry already exists for the benchmark, then overwrite it if `overwrite` is True. + """ + + def create_unique_key(row): + # This unique key is used to determine if a benchmark run already exists in the CSV + # If the key is the same, then the benchmark run already exists and will optionally + # be overwritten. Otherwise, it is considered a new benchmark run and appended. 
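+        # For example, a row from the TVD speed benchmark would produce a key like
+        # (gpu name shown as a placeholder, values are illustrative):
+        #     ("tvd", "liger", "forward", "speed", "V", "4096", '{"B": 8, "T": 2048}', "<gpu_name>")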
+        return (
+            row["kernel_name"],
+            row["kernel_provider"],
+            row["kernel_operation_mode"] if row["kernel_operation_mode"] else "",
+            row["metric_name"],
+            row["x_name"],
+            str(row["x_value"]),
+            (row["extra_benchmark_config_str"] if row["extra_benchmark_config_str"] else ""),
+            row["gpu_name"],
+        )
+
+    fieldnames = BenchmarkDataCSVRow.__annotations__.keys()
+
+    # Make filename path relative to current file
+    filename_abs_path = os.path.join(get_current_file_directory(), "../data", filename)
+    file_exists = os.path.isfile(filename_abs_path)
+
+    # Read existing data into a list of dicts
+    existing_data = []
+    if file_exists:
+        with open(filename_abs_path, mode="r") as file:
+            reader = csv.DictReader(file)
+            for row in reader:
+                existing_data.append(row)
+
+    existing_data_dict = OrderedDict((create_unique_key(row), row) for row in existing_data)
+
+    for benchmark_data in benchmark_data_list:
+        benchmark_data_dict = asdict(benchmark_data)
+        x_values = benchmark_data_dict.pop("x_values")
+        y_values_50 = benchmark_data_dict.pop("y_values_50")
+        y_values_20 = benchmark_data_dict.pop("y_values_20")
+        y_values_80 = benchmark_data_dict.pop("y_values_80")
+
+        # Need to convert benchmark_data into multiple rows based on x_values and y_values
+        for x_value, y_value_50, y_value_20, y_value_80 in zip_longest(x_values, y_values_50, y_values_20, y_values_80):
+            if y_value_50 is None:
+                y_value_50 = float("nan")
+            if y_value_20 is None:
+                y_value_20 = float("nan")
+            if y_value_80 is None:
+                y_value_80 = float("nan")
+
+            row = BenchmarkDataCSVRow(
+                x_value=x_value,
+                y_value_50=y_value_50,
+                y_value_20=y_value_20,
+                y_value_80=y_value_80,
+                **benchmark_data_dict,
+            )
+            row_dict = asdict(row)
+
+            row_key = create_unique_key(row_dict)
+
+            if row_key in existing_data_dict:
+                if overwrite:
+                    # If overwriting, update the row
+                    existing_data_dict[row_key] = row_dict
+                else:
+                    # If not overwriting, skip this row
+                    pass
+            else:
+                existing_data_dict[row_key] = row_dict
+    os.makedirs(os.path.dirname(filename_abs_path), exist_ok=True)
+    with open(filename_abs_path, mode="w", newline="") as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+
+        for row in existing_data_dict.values():
+            writer.writerow(row)
+
+
+class CustomEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, torch.dtype):
+            return str(obj)
+        # json.JSONEncoder.default takes only the offending object
+        return super().default(obj)
+
+
+def print_benchmark_data(benchmark_data_list: List[BenchmarkData]) -> None:
+    print("********** Benchmark Data **********")
+    formatted_list = [obj.__dict__ for obj in benchmark_data_list]
+    print(json.dumps(formatted_list, indent=2))
+
+
+def run_benchmarks(
+    bench_test_fn: Callable,
+    kernel_name: str,
+    metric_name: str,
+    metric_unit: str,
+    x_name: str,
+    x_label: str,
+    x_values: List[Union[float, int]],
+    kernel_providers: List[str],
+    kernel_operation_modes: Optional[List[str]] = [None],
+    extra_benchmark_configs: Optional[List[Dict[str, Any]]] = None,
+    overwrite: bool = False,
+):
+    """
+    Run benchmarks given a bench_test_fn that takes in a SingleBenchmarkRunInput as input and
+    saves data to the CSV file.
+
+    Args:
+    - bench_test_fn: The benchmark test function to run. This function should take in a
+        SingleBenchmarkRunInput as input and return a SingleBenchmarkRunOutput.
+    - kernel_name: The name of the kernel being benchmarked (e.g. "swiglu")
+    - metric_name: The name of the metric being benchmarked (e.g. "speed" or "memory")
+    - metric_unit: The unit of the metric being benchmarked (e.g.
"ms" or "MB") + - x_name: The name of the x-axis (e.g. "T" for sequence length) + - x_label: The label of the x-axis (e.g. "sequence length") + - x_values: The list of x-values to run the benchmark on (e.g. [2**i for i in range(10, 14)]) + - kernel_providers: The list of kernel providers to run the benchmark on (e.g. ["liger", "huggingface"]) + - kernel_operation_modes: The list of kernel operation modes to run the benchmark on (e.g. ["full", "backward"]) + - extra_benchmark_configs: The list of extra benchmark configurations to run the benchmark on. + - overwrite: Whether to overwrite the existing benchmark data entry if it already exists. + """ + + assert len(kernel_operation_modes) >= 1 + assert len(kernel_providers) >= 1 + + _print_benchmarking_banner(metric_name=metric_name, kernel_name=kernel_name) + + gpu_name = get_gpu_name() + benchmark_data_list = [] + for extra_benchmark_config in extra_benchmark_configs: + for kernel_operation_mode in kernel_operation_modes: + for kernel_provider in kernel_providers: + y_values_50 = [] + y_values_20 = [] + y_values_80 = [] + + for x in x_values: + single_benchmark_run_input = SingleBenchmarkRunInput( + x=x, + kernel_provider=kernel_provider, + kernel_operation_mode=kernel_operation_mode, + extra_benchmark_config=extra_benchmark_config, + ) + benchmark_result: SingleBenchmarkRunOutput = bench_test_fn(single_benchmark_run_input) + y_values_50.append(benchmark_result.y_50) + y_values_20.append(benchmark_result.y_20) + y_values_80.append(benchmark_result.y_80) + + benchmark_run_data = BenchmarkData( + kernel_name=kernel_name, + kernel_operation_mode=kernel_operation_mode, + kernel_provider=kernel_provider, + metric_name=metric_name, + metric_unit=metric_unit, + gpu_name=gpu_name, + x_name=x_name, + x_label=x_label, + x_values=x_values, + y_values_50=y_values_50, + y_values_20=y_values_20, + y_values_80=y_values_80, + extra_benchmark_config_str=json.dumps(extra_benchmark_config, cls=CustomEncoder), + timestamp=get_formatted_time(), + liger_version=LIGER_KERNEL_VERSION, + ) + + benchmark_data_list.append(benchmark_run_data) + + print_benchmark_data(benchmark_data_list) + + update_benchmark_data_csv(benchmark_data_list=benchmark_data_list, overwrite=overwrite) + + +def parse_benchmark_script_args(): + parser = argparse.ArgumentParser(description="Benchmarking script for Liger-Kernel") + + parser.add_argument( + "--overwrite", + action="store_true", + help="Flag to overwrite existing benchmark data with current run.", + ) + parser.add_argument( + "--model", + type=str, + default=None, + help=( + "Model config name from MODEL_REGISTRY " + "(e.g. llama_2_7b, llama_3_8b). " + "Defaults to llama_3_8b when not specified." 
+ ), + ) + args = parser.parse_args() + return args diff --git a/dev/fmt-requirements.txt b/dev/fmt-requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..1d8f48692ce6c44eacf156ed5027a1d330741ba4 --- /dev/null +++ b/dev/fmt-requirements.txt @@ -0,0 +1 @@ +ruff>=0.1.6 diff --git a/dev/modal/benchmarks.py b/dev/modal/benchmarks.py new file mode 100755 index 0000000000000000000000000000000000000000..a54fa47d88ba93a9c092f2a062f06a07eb66d992 --- /dev/null +++ b/dev/modal/benchmarks.py @@ -0,0 +1,73 @@ +from pathlib import Path + +import modal + +ROOT_PATH = Path(__file__).parent.parent.parent +REMOTE_ROOT_PATH = "/root/liger-kernel" +PYTHON_VERSION = "3.12" + +image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv") + +app = modal.App("liger_benchmarks", image=image) + +# mount: add local files to the remote container +repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH) + + +@app.function(gpu="H100!", image=repo, timeout=60 * 90) +def liger_benchmarks(): + import os + import subprocess + + subprocess.run( + ["uv pip install -e '.[dev]' --system"], + check=True, + shell=True, + cwd=REMOTE_ROOT_PATH, + ) + subprocess.run(["make run-benchmarks"], check=True, shell=True, cwd=REMOTE_ROOT_PATH) + + file_path = Path(REMOTE_ROOT_PATH) / "benchmark" / "data" / "all_benchmark_data.csv" + print(f"Checking if file exists at: {file_path}") + print(f"File exists: {os.path.exists(file_path)}") + + if not os.path.exists(file_path): + print("Listing directory contents:") + data_dir = file_path.parent + if os.path.exists(data_dir): + print(f"Contents of {data_dir}:") + print(os.listdir(data_dir)) + else: + print(f"Data directory {data_dir} does not exist") + raise FileNotFoundError(f"Benchmark data file not found at {file_path}") + + with open(file_path, "rb") as f: + data = f.read() + print(f"Successfully read {len(data)} bytes of data") + return data + + +@app.local_entrypoint() +def main(): + try: + # Run the benchmarks and get the data + print("Starting benchmark run...") + benchmark_data = liger_benchmarks.remote() + + if not benchmark_data: + raise ValueError("No data received from remote function") + + # Save the data locally + local_data_path = ROOT_PATH / "benchmark" / "data" / "all_benchmark_data.csv" + print(f"Attempting to save data to: {local_data_path}") + + local_data_path.parent.mkdir(parents=True, exist_ok=True) + + with open(local_data_path, "wb") as f: + f.write(benchmark_data) + + print(f"Successfully saved {len(benchmark_data)} bytes to: {local_data_path}") + + except Exception as e: + print(f"Error occurred: {str(e)}") + raise diff --git a/dev/modal/tests.py b/dev/modal/tests.py new file mode 100755 index 0000000000000000000000000000000000000000..07856dcecb1af92fc260e780b65b8b4c10421eeb --- /dev/null +++ b/dev/modal/tests.py @@ -0,0 +1,86 @@ +from pathlib import Path + +import modal + +ROOT_PATH = Path(__file__).parent.parent.parent +REMOTE_ROOT_PATH = "/root/liger-kernel" +PYTHON_VERSION = "3.12" + +OLDEST_SUPPORTED_TRANSFORMERS_V4_VERSION = "4.52.0" + +image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv") + +app = modal.App("liger_tests", image=image) + +# mount: add local files to the remote container +repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH) + + +@app.function(gpu="H100!", image=repo, timeout=90 * 60) +def liger_correctness_tests(): + import subprocess + + subprocess.run( + ["uv pip install -e '.[dev]' --system"], + check=True, + shell=True, + 
cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
+
+
+@app.function(gpu="H100!", image=repo, timeout=90 * 60)
+def liger_convergence_tests():
+    import subprocess
+
+    subprocess.run(
+        ["uv pip install -e '.[dev]' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(["make test-convergence"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
+
+
+oldest_v4_app = modal.App("liger_oldest_v4_tests", image=image)  # 4.52.0
+
+
+@oldest_v4_app.function(gpu="H100!", image=repo, timeout=90 * 60)
+def liger_oldest_v4_correctness_tests():
+    import subprocess
+
+    subprocess.run(
+        ["uv pip install -e '.[dev]' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(
+        [f"uv pip install 'transformers=={OLDEST_SUPPORTED_TRANSFORMERS_V4_VERSION}' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
+
+
+@oldest_v4_app.function(gpu="H100!", image=repo, timeout=90 * 60)
+def liger_oldest_v4_convergence_tests():
+    import subprocess
+
+    subprocess.run(
+        ["uv pip install -e '.[dev]' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(
+        [f"uv pip install 'transformers=={OLDEST_SUPPORTED_TRANSFORMERS_V4_VERSION}' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(["make test-convergence"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
+
+
+latest_v4_app = modal.App("liger_latest_v4_tests", image=image)  # 4.57.6
diff --git a/docs/Examples.md b/docs/Examples.md
new file mode 100755
index 0000000000000000000000000000000000000000..41a1fb92fab04c9b4698fbdfa030f4619144f83a
--- /dev/null
+++ b/docs/Examples.md
@@ -0,0 +1,268 @@
+
+!!! Example "HANDS-ON USE CASE EXAMPLES"
+| **Use Case** | **Description** |
+|------------------------------------------------|---------------------------------------------------------------------------------------------------|
+| [**Hugging Face Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/huggingface) | Train LLaMA 3-8B ~20% faster with over 40% memory reduction on the Alpaca dataset using 4 A100s with FSDP |
+| [**Lightning Trainer**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/lightning) | Increase throughput by 15% and reduce memory usage by 40% with LLaMA3-8B on the MMLU dataset using 8 A100s with DeepSpeed ZeRO3 |
+| [**Medusa Multi-head LLM (Retraining Phase)**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) | Reduce memory usage by 80% with 5 LM heads and improve throughput by 40% using 8 A100s with FSDP |
+| [**Vision-Language Model SFT**](https://github.com/linkedin/Liger-Kernel/tree/main/examples/huggingface/run_qwen2_vl.sh) | Finetune Qwen2-VL on image-text data using 4 A100s with FSDP |
+| [**Liger ORPO Trainer**](https://github.com/linkedin/Liger-Kernel/blob/main/examples/alignment/run_orpo.py) | Align Llama 3.2 using the Liger ORPO Trainer with FSDP, with 50% memory reduction |
+
+## HuggingFace Trainer
+
+### How to Run
+
+#### Locally on a GPU machine
+You can run the example locally on a GPU machine. The default hyperparameters and configurations work on a single node with 4xA100 80GB GPUs and FSDP.
+
+!!! Example
+
+```bash
+pip install -r requirements.txt
+sh run_{MODEL}.sh
+```
+
+#### Remotely on Modal
+If you do not have access to a GPU machine, you can run the example on Modal. Modal is a serverless platform that allows you to run your code on a remote GPU machine. You can sign up for a free account at [Modal](https://www.modal.com/).
+
+!!! Example
+
+```bash
+pip install modal
+modal setup # authenticate with Modal
+modal run launch_on_modal.py --script "run_qwen2_vl.sh"
+```
+
+!!! Notes
+
+1. This example uses an optional `use_liger` flag. If true, it applies a one-line monkey patch to use the Liger kernels.
+
+2. The example uses the Llama3 model, which requires a community license agreement and Hugging Face Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree to the [community license agreement](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
+    * Run `huggingface-cli login` and enter your HuggingFace token.
+
+3. The default hyperparameters and configurations work on a single node with 4xA100 80GB GPUs. For running on devices with less GPU RAM, please consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
+
+
+### Benchmark Result
+
+### Llama
+
+!!! Info
+>Benchmark conditions:
+>Model = LLaMA 3-8B, Dataset = Alpaca, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 20%, while GPU memory usage drops by 40%. This allows you to train the model on smaller GPUs, use larger batch sizes, or handle longer sequence lengths without incurring additional costs.
+
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/llama_tps.png)
+![GPU Memory Allocated](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/llama_mem_alloc.png)
+
+### Qwen
+
+!!! Info
+>Benchmark conditions:
+>Model = Qwen2-7B, Dataset = Alpaca, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 10%, while GPU memory usage drops by 50%.
+
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/qwen_tps.png)
+![GPU Memory Allocated](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/qwen_mem_alloc.png)
+
+
+### Gemma 7B
+
+!!! Info
+>Benchmark conditions:
+>Model = Gemma-7B, Dataset = Alpaca, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 24%, while GPU memory usage drops by 33%.
+
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/gemma_7b_tp.png)
+![GPU Memory Allocated](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/huggingface/img/gemma_7b_mem.png)
+
+## Lightning Trainer
+
+### How to Run
+
+#### Locally on a GPU machine
+You can run the example locally on a GPU machine.
+
+!!! Example
+
+```bash
+pip install -r requirements.txt
+
+# For single L40 48GB GPU
+python training.py --model Qwen/Qwen2-0.5B-Instruct --num_gpu 1 --max_length 1024
+
+# For 8XA100 40GB
+python training.py --model meta-llama/Meta-Llama-3-8B --strategy deepspeed
+```
+
+!!! Notes
+
+1. The example uses the Llama3 model, which requires a community license agreement and Hugging Face Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree to the [community license agreement](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
+    * Run `huggingface-cli login` and enter your HuggingFace token.
+
+2. The default hyperparameters and configuration for Gemma work on a single L40 48GB GPU, and the config for Llama works on a single node with 8xA100 40GB GPUs. For running on devices with less GPU RAM, please consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
+
+## Medusa
+
+Medusa is a simple framework that democratizes the acceleration techniques for LLM generation with multiple decoding heads. To learn more, you can check out the [repo](https://github.com/FasterDecoding/Medusa) and the [paper](https://arxiv.org/abs/2401.10774).
+
+The Liger fused CE kernel is highly effective in this scenario, eliminating the need to materialize logits for each head, which usually consumes a large volume of memory due to the extensive vocabulary size (e.g., for LLaMA-3, the vocabulary size is 128k).
+
+The introduction of multiple heads can easily lead to OOM (Out of Memory) issues. However, thanks to the efficient Liger fused CE, which calculates the gradient in place and doesn't materialize the logits, we have observed very effective results. This efficiency opens up more opportunities for multi-token prediction research and development.
+
+
+### How to Run
+
+!!! Example
+
+```bash
+git clone git@github.com:linkedin/Liger-Kernel.git
+cd {PATH_TO_Liger-Kernel}/Liger-Kernel/
+pip install -e .
+cd {PATH_TO_Liger-Kernel}/Liger-Kernel/examples/medusa
+pip install -r requirements.txt
+sh scripts/llama3_8b_medusa.sh
+```
+
+!!! Notes
+
+1. This example uses an optional `use_liger` flag. If true, it applies a monkey patch to use the Liger kernels with the Medusa heads.
+
+2. The example uses the Llama3 model, which requires a community license agreement and Hugging Face Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree to the community license agreement: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+    * Run `huggingface-cli login` and enter your HuggingFace token
+
+3. The default hyperparameters and configurations work on a single node with 8xA100 GPUs. For running on devices with less GPU RAM, please consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
+
+4. We are using a smaller sample of ShareGPT data primarily to benchmark performance. The example requires hyperparameter tuning and dataset selection to work effectively, and the dataset should have the same distribution as the LLaMA pretraining data. Contributions to enhance the example code are welcome.
+
+### Benchmark Result
+
+!!! Info
+> 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 6, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
+
+#### Stage 1
+
+Stage 1 refers to Medusa-1, where the backbone model is frozen and only the weights of the LLM heads are updated.
+
+!!! Warning
+```bash
+# Setting this flag to True in llama3_8b_medusa.sh enables Stage 1
+--medusa_only_heads True
+```
+
+#### num_head = 3
+
+![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Memory_Stage1_num_head_3.png)
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png)
+
+#### num_head = 5
+
+![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Memory_Stage1_num_head_5.png)
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png)
+
+#### Stage 2
+
+!!! Warning
+```bash
+# Setting this flag to False in llama3_8b_medusa.sh enables Stage 2
+--medusa_only_heads False
+```
+
+Stage 2 refers to Medusa-2, where all the model weights are updated, including the backbone model and the LLM heads.
+
+#### num_head = 3
+
+![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Memory_Stage2_num_head_3.png)
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png)
+
+#### num_head = 5
+
+![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Memory_Stage2_num_head_5.png)
+![Throughput](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png)
+
+
+## Vision-Language Model SFT
+
+### How to Run
+
+#### Locally on a GPU Machine
+You can run the example locally on a GPU machine. The default hyperparameters and configurations work on a single node with 4xA100 80GB GPUs.
+
+!!! Example
+```bash
+#!/bin/bash
+
+torchrun --nnodes=1 --nproc-per-node=4 training_multimodal.py \
+    --model_name "Qwen/Qwen2-VL-7B-Instruct" \
+    --bf16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --eval_strategy "no" \
+    --save_strategy "no" \
+    --learning_rate 6e-6 \
+    --weight_decay 0.05 \
+    --warmup_ratio 0.1 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --include_num_input_tokens_seen \
+    --report_to none \
+    --fsdp "full_shard auto_wrap" \
+    --fsdp_config config/fsdp_config.json \
+    --seed 42 \
+    --use_liger True \
+    --output_dir multimodal_finetuning
+```
+
+## ORPO Trainer
+
+### How to Run
+
+#### Locally on a GPU Machine
+
+You can run the example locally on a GPU machine with FSDP.
+
+!!! Example
+```py
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import ORPOConfig  # noqa: F401
+
+from liger_kernel.transformers.trainer import LigerORPOTrainer  # noqa: F401
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.2-1B-Instruct",
+    dtype=torch.bfloat16,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "meta-llama/Llama-3.2-1B-Instruct",
+    max_length=512,
+    padding="max_length",
+)
+tokenizer.pad_token = tokenizer.eos_token
+
+train_dataset = load_dataset("trl-lib/tldr-preference", split="train")
+
+training_args = ORPOConfig(
+    output_dir="Llama3.2_1B_Instruct",
+    beta=0.1,
+    max_length=128,
+    per_device_train_batch_size=32,
+    max_steps=100,
+    save_strategy="no",
+)
+
+trainer = LigerORPOTrainer(
+    model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset
+)
+
+trainer.train()
+```
\ No newline at end of file
diff --git a/docs/Getting-Started.md b/docs/Getting-Started.md
new file mode 100755
index 0000000000000000000000000000000000000000..3b6af54777479a2ad7078b338cf1da138754be0d
--- /dev/null
+++ b/docs/Getting-Started.md
@@ -0,0 +1,64 @@
+There are a couple of ways to apply Liger kernels, depending on the level of customization required.
+
+### 1. Use AutoLigerKernelForCausalLM
+
+Using the `AutoLigerKernelForCausalLM` is the simplest approach, as you don't have to import a model-specific patching API. If the model type is supported, the modeling code will be automatically patched using the default settings.
+
+!!! Example
+
+    ```python
+    from liger_kernel.transformers import AutoLigerKernelForCausalLM
+
+    # This AutoModel wrapper class automatically monkey-patches the
+    # model with the optimized Liger kernels if the model is supported.
+    model = AutoLigerKernelForCausalLM.from_pretrained("path/to/some/model")
+    ```
+
+### 2. Apply Model-Specific Patching APIs
+
+Using the [patching APIs](https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#patching), you can swap Hugging Face models with optimized Liger Kernels.
+
+!!! Example
+
+```python
+import transformers
+from liger_kernel.transformers import apply_liger_kernel_to_llama
+
+# 1a. Adding this line automatically monkey-patches the model with the optimized Liger kernels
+apply_liger_kernel_to_llama()
+
+# 1b. You could alternatively specify exactly which kernels are applied
+apply_liger_kernel_to_llama(
+    rope=True,
+    swiglu=True,
+    cross_entropy=True,
+    fused_linear_cross_entropy=False,
+    rms_norm=False
+)
+
+# 2. Instantiate the patched model
+model = transformers.AutoModelForCausalLM.from_pretrained("path/to/llama/model")
+```
+
+### 3. Compose Your Own Model
+
+You can take individual [kernels](https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#model-kernels) to compose your models.
+
+!!!
Example + +```python +from liger_kernel.transformers import LigerFusedLinearCrossEntropyLoss +import torch.nn as nn +import torch + +model = nn.Linear(128, 256).cuda() + +# fuses linear + cross entropy layers together and performs chunk-by-chunk computation to reduce memory +loss_fn = LigerFusedLinearCrossEntropyLoss() + +input = torch.randn(4, 128, requires_grad=True, device="cuda") +target = torch.randint(256, (4, ), device="cuda") + +loss = loss_fn(model.weight, input, target) +loss.backward() +``` \ No newline at end of file diff --git a/docs/High-Level-APIs.md b/docs/High-Level-APIs.md new file mode 100755 index 0000000000000000000000000000000000000000..5433e03d38f05d3078361f950a7fd4fbc8bf0598 --- /dev/null +++ b/docs/High-Level-APIs.md @@ -0,0 +1,93 @@ +# High-Level APIs + +## AutoModel + +| **AutoModel Variant** | **API** | +|------------------------|---------| +| AutoModelForCausalLM | `liger_kernel.transformers.AutoLigerKernelForCausalLM` | + +This API extends the implementation of the `AutoModelForCausalLM` within the `transformers` library from Hugging Face. + +::: liger_kernel.transformers.AutoLigerKernelForCausalLM + options: + extra: + show_docstring: true + show_signature: true + show_source: true + +!!! Example "Try it Out" + You can experiment as shown in this example [here](https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#1-use-autoligerkernelforcausallm). + +--- + +## Patching + +You can also use the Patching APIs to use the kernels for a specific model architecture. + +| **Model** | **API** | **Supported Operations** | +|-------------|--------------------------------------------------------------|-------------------------------------------------------------------------| +| LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Qwen2, Qwen2.5, & QwQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | + +### Function Signatures + +::: liger_kernel.transformers.apply_liger_kernel_to_llama + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_mllama + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_mistral + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_mixtral + options: + extra: + show_docstring: true + show_signature: true 
+ +::: liger_kernel.transformers.apply_liger_kernel_to_gemma + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_gemma2 + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_qwen2 + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl + options: + extra: + show_docstring: true + show_signature: true + +::: liger_kernel.transformers.apply_liger_kernel_to_phi3 + options: + extra: + show_docstring: true + show_signature: true diff --git a/docs/Low-Level-APIs.md b/docs/Low-Level-APIs.md new file mode 100755 index 0000000000000000000000000000000000000000..03cfcb0081c9455ecafbeabce2277217b29c9bd2 --- /dev/null +++ b/docs/Low-Level-APIs.md @@ -0,0 +1,133 @@ +## Model Kernels + +| **Kernel** | **API** | +|---------------------------------|-------------------------------------------------------------| +| RMSNorm | `liger_kernel.transformers.LigerRMSNorm` | +| LayerNorm | `liger_kernel.transformers.LigerLayerNorm` | +| RoPE | `liger_kernel.transformers.liger_rotary_pos_emb` | +| SwiGLU | `liger_kernel.transformers.LigerSwiGLUMLP` | +| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` | +| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` | +| Fused Linear CrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`| +| Multi Token Attention | `liger_kernel.transformers.LigerMultiTokenAttention` | +| Softmax | `liger_kernel.transformers.LigerSoftmax` | +| Sparsemax | `liger_kernel.transformers.LigerSparsemax` | +| mHC (Hyper-Connections) | `liger_kernel.transformers.LigerMHC` | + + +### RMS Norm + +RMS Norm simplifies the LayerNorm operation by eliminating mean subtraction, which reduces computational complexity while retaining effectiveness. + +This kernel performs normalization by scaling input vectors to have a unit root mean square (RMS) value. This method allows for a ~7x speed improvement and a ~3x reduction in memory footprint compared to +implementations in PyTorch. + +!!! Example "Try it out" + You can experiment as shown in this example [here](https://colab.research.google.com/drive/1CQYhul7MVG5F0gmqTBbx1O1HgolPgF0M?usp=sharing). + +### RoPE + +RoPE (Rotary Position Embedding) enhances the positional encoding used in transformer models. + +The implementation allows for effective handling of positional information without incurring significant computational overhead. + +!!! Example "Try it out" + You can experiment as shown in this example [here](https://colab.research.google.com/drive/1llnAdo0hc9FpxYRRnjih0l066NCp7Ylu?usp=sharing). + +### SwiGLU + +### GeGLU + +### CrossEntropy + +This kernel is optimized for calculating the loss function used in classification tasks. + +The kernel achieves a ~3x execution speed increase and a ~5x reduction in memory usage for substantial vocabulary sizes compared to implementations in PyTorch. + +!!! Example "Try it out" + You can experiment as shown in this example [here](https://colab.research.google.com/drive/1WgaU_cmaxVzx8PcdKB5P9yHB6_WyGd4T?usp=sharing). + +### Fused Linear CrossEntropy + +This kernel combines linear transformations with cross-entropy loss calculations into a single operation. + +!!! 
Example "Try it out" + You can experiment as shown in this example [here](https://colab.research.google.com/drive/1Z2QtvaIiLm5MWOs7X6ZPS1MN3hcIJFbj?usp=sharing) + +### Multi Token Attention + +The Multi Token Attention kernel implementation provides and optimized fused implementation of multi-token attention over the implemented Pytorch model baseline. This is a new attention mechanism that can operate on multiple Q and K inputs introduced by Meta Research. + +Paper: https://arxiv.org/abs/2504.00927 + +### Softmax + +The Softmax kernel implementation provides an optimized implementation of the softmax operation, which is a fundamental component in neural networks for converting raw scores into probability distributions. + +The implementation shows notable speedups compared to the Softmax PyTorch implementation + + +### Sparsemax + +Sparsemax is a sparse alternative to softmax that produces sparse probability distributions. This kernel implements an efficient version of the sparsemax operation that can be used as a drop-in replacement for softmax in attention mechanisms or classification tasks. + +The implementation achieves significant speed improvements and memory savings compared to standard PyTorch implementations, particularly for large input tensors. + +### mHC (Manifold-Constrained Hyper-Connections) + +mHC implements fused Triton kernels for Manifold-Constrained Hyper-Connections ([arXiv:2512.24880](https://arxiv.org/abs/2512.24880)). It wraps an arbitrary layer `F: [..., C] -> [..., C]` with multiple residual streams, constraining the residual routing matrix `H_res` onto the Birkhoff polytope (doubly-stochastic matrices) via Sinkhorn-Knopp iterations to stabilize training. + +The `LigerMHC` module takes input of shape `[..., HC, C]` where `HC` is the number of residual streams, and performs: + +1. **Coefficients** -- Compute data-dependent routing coefficients (`h_pre`, `h_post`, `h_res`) via fused matmul + RMS normalization + Sinkhorn-Knopp iterations. +2. **Pre-aggregate** -- `x_in = sum_i h_pre[i] * x[i]` (shape: `[..., C]`) +3. **Layer** -- `f_out = layer(x_in)` (shape: `[..., C]`) +4. 
**Post + residual** -- `x_out[o] = sum_i h_res[o,i] * x[i] + h_post[o] * f_out` (shape: `[..., HC, C]`) + +Usage: + +```python +import torch +import torch.nn as nn +from liger_kernel.transformers import LigerMHC + +# Wrap a linear layer with 4 residual streams of dimension 256 +layer = nn.Linear(256, 256, bias=False, device="cuda", dtype=torch.bfloat16) +mhc = LigerMHC(layer, hc=4, c=256, phi_dtype=torch.bfloat16).cuda() + +# Input: [batch, seq_len, num_streams, channels] in BF16/FP16 +x = torch.randn(2, 128, 4, 256, device="cuda", dtype=torch.bfloat16) +out = mhc(x) # shape: [2, 128, 4, 256] +``` + +Functional APIs are also available: + +- `liger_kernel.transformers.functional.liger_mhc_coeffs` -- Compute routing coefficients +- `liger_kernel.transformers.functional.liger_mhc_pre` -- Pre-aggregation +- `liger_kernel.transformers.functional.liger_mhc_post_res` -- Post-aggregation + residual +- `liger_kernel.transformers.functional.liger_mhc_apply` -- Combined pre + post_res +- `liger_kernel.transformers.functional.liger_mhc_forward` -- Full forward pass (coeffs + pre + layer + post_res) + +## Alignment Kernels + +| **Kernel** | **API** | +|---------------------------------|-------------------------------------------------------------| +| Fused Linear CPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearCPOLoss` | +| Fused Linear DPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearDPOLoss` | +| Fused Linear ORPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearORPOLoss` | +| Fused Linear SimPO Loss | `liger_kernel.chunked_loss.LigerFusedLinearSimPOLoss` | + +## Distillation Kernels + +| **Kernel** | **API** | +|---------------------------------|-------------------------------------------------------------| +| KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` | +| JSD | `liger_kernel.transformers.LigerJSD` | +| Fused Linear JSD | `liger_kernel.transformers.LigerFusedLinearJSD` | + +## Experimental Kernels + +| **Kernel** | **API** | +|---------------------------------|-------------------------------------------------------------| +| Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` | +| Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul` | \ No newline at end of file diff --git a/docs/acknowledgement.md b/docs/acknowledgement.md new file mode 100755 index 0000000000000000000000000000000000000000..9dfdb4f7b8ba3f96bea71af1e7dc56d121723fab --- /dev/null +++ b/docs/acknowledgement.md @@ -0,0 +1,23 @@ + +### Design + +- [@claire_yishan](https://twitter.com/claire_yishan) for the LOGO design +- [Wave Snippets](https://www.wavesnippets.com/) for generating the animated code snippets + +### Code + +We referenced or used the following projects: + + +| # | Project | Description | Location | License | +|---|----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| +| 1 | [Unsloth](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43) | `calculate_settings` to determine block size and warp; We reuse it for Norm and MLP | [Liger Kernel Utils](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/utils.py#L23) 
| [Apache](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/LICENSE) |
+| 2 | [Unsloth](https://github.com/unslothai/unsloth/blob/976d11a10d54383aeb7a692c69e01151a20bfd72/unsloth/kernels/rms_layernorm.py#L48) | We modified and added dW calculation on top of the Unsloth implementation | [Liger Kernel RMS Norm](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/rms_norm.py#L50) | [Apache](https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/LICENSE) |
+| 3 | [Triton tutorial](https://triton-lang.org/main/index.html) | We modified on top of the Triton tutorials | [Liger Kernel RMS Norm](https://github.com/linkedin/Liger-Kernel/blob/e249eee723978bf8610ff1ea2297d048a2417e20/src/liger_kernel/ops/rms_norm.py#L50) | [MIT](https://github.com/triton-lang/triton/blob/main/LICENSE) |
+| 4 | [tiny shakespeare dataset](https://huggingface.co/datasets/karpathy/tiny_shakespeare) | We use the tiny shakespeare dataset to conduct convergence tests on mini models | [Liger Kernel Convergence](https://github.com/linkedin/Liger-Kernel/tree/main/test/convergence) | N/A |
+| 5 | [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy) | We use the idea of gradient-in-forward and chunking | [Liger Kernel Linear Cross Entropy](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py) | [MIT](https://github.com/mgmalek/efficient_cross_entropy/blob/main/LICENSE) |
+| 6 | [Flash attn](https://github.com/Dao-AILab/flash-attention) | We take many optimization ideas from the work, such as tiling and recomputation | | [BSD](https://github.com/Dao-AILab/flash-attention/blob/main/LICENSE) |
+| 7 | [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) | We reference the design of the automodel | [Liger Kernel Auto Model](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/auto_model.py) | [MIT](https://github.com/casper-hansen/AutoAWQ/blob/main/LICENSE) |
+| 8 | [llm.c](https://github.com/karpathy/llm.c) | We reference the design of the end-to-end testing | [Liger Kernel Convergence Tests](https://github.com/linkedin/Liger-Kernel/tree/main/test/convergence) | [MIT](https://github.com/karpathy/llm.c/blob/master/LICENSE) |
+
+Many thanks to the contributors to these projects for their invaluable work that helped make Liger possible.
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100755
index 0000000000000000000000000000000000000000..8388f6c432b2ceb82fefd48ddd304452229fe480
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,114 @@
+
+
+Thank you for your interest in contributing to Liger-Kernel! This guide will help you set up your development environment, add a new kernel, run tests, and submit a pull request (PR).
+
+### Maintainers
+@ByronHsu(admin) @qingquansong @yundai424 @kvignesh1420 @lancerts @JasonZhu1313 @shimizust @vaibhavjindal @tcc0403 @momochen
+
+## Interested in the ticket?
+
+Leave `#take` in the comment and tag the maintainer.
+
+## Setting Up Your Development Environment
+
+1. **Clone the Repository**
+```sh
+git clone https://github.com/linkedin/Liger-Kernel.git
+cd Liger-Kernel
+```
+2. **Install Dependencies and Editable Package**
+```
+pip install -e .[dev]
+```
+If you encounter the error `no matches found: .[dev]`, use
+```
+pip install -e .'[dev]'
+```
+3. **Install pre-commit hooks using [`prek`](https://prek.j178.dev/), a `pre-commit` alternative written in Rust**
+```
+prek install
+```
+Run the pre-commit checks without committing (`-a` is equivalent to `--all-files`):
+```
+prek run -a
+```
+
+## Structure
+
+### Source Code
+- `ops/`: Core Triton operations.
+- `transformers/`: PyTorch `nn.Module` implementations built on Triton operations, compliant with the `transformers` API.
+
+### Tests
+
+- `transformers/`: Correctness tests for the Triton-based layers.
+- `convergence/`: Patches Hugging Face models with all kernels, runs multiple iterations, and compares weights, logits, and loss layer-by-layer.
+
+### Benchmark
+
+- `benchmark/`: Execution time and memory benchmarks compared to Hugging Face layers.
+
+## Adding support for a new model
+To get familiar with the folder structure, please refer [here](https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#structure).
+
+1. **Figure out the kernels that can be monkey-patched**
+    - Check the `src/liger_kernel/ops` directory to find the kernels that can be monkey-patched.
+    - Kernels like Fused Linear Cross Entropy require a custom lce_forward function to allow monkey-patching. For adding kernels requiring a similar approach, ensure that you create the corresponding forward function in the `src/liger_kernel/transformers/model` directory.
+
+2. **Monkey-patch the HuggingFace model**
+    - Add the monkey-patching code in the `src/liger_kernel/transformers/monkey_patch.py` file.
+    - Ensure that the monkey-patching function is added to the `__init__.py` file in the `src/liger_kernel/transformers/` directory.
+
+3. **Add Unit Tests**
+    - Create unit tests and convergence tests for the monkey-patched model in the tests directory. Ensure that your tests cover all functionalities of the monkey-patched model.
+
+## Adding a New Kernel
+To get familiar with the folder structure, please refer [here](https://github.com/linkedin/Liger-Kernel?tab=readme-ov-file#structure).
+
+1. **Create Your Kernel**
+Add your kernel implementation in `src/liger_kernel/`.
+
+2. **Add Unit Tests**
+Create unit tests and convergence tests for your kernel in the tests directory. Ensure that your tests cover all kernel functionalities.
+
+3. **Add Benchmark Script**
+Add a benchmarking script under `benchmark/scripts` using the naming convention `benchmark_{kernel_name}.py` showing the performance difference between the Liger kernel and HuggingFace (a minimal skeleton is sketched at the end of this guide).
+
+## Run tests
+
+### Use Makefile to run full tests
+1. Run `make test` to ensure correctness.
+2. Run `make checkstyle` to ensure code style.
+3. Run `make test-convergence` to ensure convergence.
+
+### Run pytest on single file
+`python -m pytest test_sample.py::test_function_name`
+
+## Run kernel benchmarks
+The `/benchmark` directory contains benchmarking scripts for the individual kernels, demonstrating differences in speed and memory usage between using Liger and HuggingFace module implementations.
+
+1. Run `make run-benchmarks` to run all benchmarking scripts and append data to `benchmark/data/all_benchmark_data.csv`.
+    - Existing entries that are the same (based on `kernel_name`, `kernel_provider`, `kernel_operation_mode`, `metric_name`, `x_name`, `x_value`, `extra_benchmark_config_str`, and `gpu_name`) will not be overwritten.
+2. Run `make run-benchmarks OVERWRITE=1` to overwrite any existing entries that have the same configuration.
+3. Run `python benchmark/scripts/benchmark_{kernel_name}.py` to run an individual benchmark.
+4. You can use the `benchmark/benchmarks_visualizer.py` script to generate visualizations from the CSV; these are then saved to the `benchmark/visualizations` directory (note: this directory is not tracked by git).
+
+## Submit PR
+Fork the repo, copy and paste the successful test logs in the PR, and submit the PR following the PR template (**[example PR](https://github.com/linkedin/Liger-Kernel/pull/21)**).
+
+> As a contributor, you represent that the code you submit is your original work or that of your employer (in which case you represent you have the right to bind your employer). By submitting code, you (and, if applicable, your employer) are licensing the submitted code to LinkedIn and the open source community subject to the BSD 2-Clause license.
+
+#### Release (Maintainer only)
+
+1. Bump the version in pyproject.toml to the desired version (for example, `0.2.0`)
+2. Submit a PR and merge
+3. Create a new release based on the current HEAD, with the tag name prefixed by `v`, for example `v0.2.0`. Alternatively, if you want to create a release based on a different commit hash, run `git tag v0.2.0 && git push origin v0.2.0` and create the release based on that tag
+4. Add release notes: the minimum requirement is to click the `Generate Release Notes` button, which automatically generates 1) the changes included and 2) new contributors. It's good to add sections on top to highlight the important changes.
+5. A new pip upload will be triggered upon a new release. NOTE: Both pre-releases and official releases will trigger the workflow to build the wheel and publish to PyPI, so please be sure that steps 1-3 are followed correctly!
+
+### Notes on version
+Here we follow [semantic versioning](https://semver.org/). Denoting the version as `major.minor.patch`, we increment:
+
+- Major version when there is a backward-incompatible change.
+- Minor version when there is new backward-compatible functionality.
+- Patch version for bug fixes.
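+
+## Example: Minimal Benchmark Script Skeleton
+
+The sketch below shows what a new `benchmark/scripts/benchmark_{kernel_name}.py` can look like, built on the helpers defined in `benchmark/scripts/utils.py` (`SingleBenchmarkRunInput`, `SingleBenchmarkRunOutput`, `QUANTILES`, `run_benchmarks`). It uses `LigerSoftmax` versus `torch.softmax` purely as a stand-in comparison; the no-argument `LigerSoftmax()` constructor, the same-directory `from utils import ...` pattern, and the chosen shapes are assumptions to adapt to your kernel, not a definitive recipe.
+
+```python
+import torch
+import triton
+
+from utils import (
+    QUANTILES,
+    SingleBenchmarkRunInput,
+    SingleBenchmarkRunOutput,
+    parse_benchmark_script_args,
+    run_benchmarks,
+)
+
+from liger_kernel.transformers import LigerSoftmax  # assumed no-arg constructor
+from liger_kernel.utils import infer_device
+
+device = infer_device()
+
+
+def bench_speed_softmax(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    N = input.x
+    x = torch.randn(N, N, device=device)
+    liger_softmax = LigerSoftmax()
+
+    def fwd():
+        # Dispatch on the provider being benchmarked.
+        if input.kernel_provider == "liger":
+            return liger_softmax(x)
+        return torch.softmax(x, dim=-1)
+
+    # QUANTILES = [0.5, 0.2, 0.8], so do_bench returns (median, p20, p80).
+    ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100)
+    return SingleBenchmarkRunOutput(y_20=ms_20, y_50=ms_50, y_80=ms_80)
+
+
+if __name__ == "__main__":
+    args = parse_benchmark_script_args()
+    run_benchmarks(
+        bench_test_fn=bench_speed_softmax,
+        kernel_name="softmax",
+        kernel_operation_modes=["forward"],
+        metric_name="speed",
+        metric_unit="ms",
+        x_name="N",
+        x_label="input size",
+        x_values=[2**i for i in range(8, 13)],
+        kernel_providers=["liger", "torch"],
+        # run_benchmarks iterates over extra_benchmark_configs, so pass at
+        # least one (possibly empty) config dict.
+        extra_benchmark_configs=[{}],
+        overwrite=args.overwrite,
+    )
+```
+
+A matching `bench_memory_*` function would wrap the same `fwd` with `_test_memory` and be registered via a second `run_benchmarks` call with `metric_name="memory"` and `metric_unit="MB"`.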
diff --git a/docs/images/banner.GIF b/docs/images/banner.GIF new file mode 100755 index 0000000000000000000000000000000000000000..a6a3f63030044efd0dd184a15f4198fab7b085d4 Binary files /dev/null and b/docs/images/banner.GIF differ diff --git a/docs/images/compose.gif b/docs/images/compose.gif new file mode 100755 index 0000000000000000000000000000000000000000..1a7994e536d07d8f2c292e27ca1e0ebfd6a165ff Binary files /dev/null and b/docs/images/compose.gif differ diff --git a/docs/images/e2e-memory.png b/docs/images/e2e-memory.png new file mode 100755 index 0000000000000000000000000000000000000000..ab2f9176055e353199e0bc0ac73e891c8acfe804 Binary files /dev/null and b/docs/images/e2e-memory.png differ diff --git a/docs/images/e2e-tps.png b/docs/images/e2e-tps.png new file mode 100755 index 0000000000000000000000000000000000000000..624ba96d956a92b4612e07edb00a3891328d7c78 Binary files /dev/null and b/docs/images/e2e-tps.png differ diff --git a/docs/images/logo-banner.png b/docs/images/logo-banner.png new file mode 100755 index 0000000000000000000000000000000000000000..fe69d0044269597f78d733cc594fe96c1c23d1d0 Binary files /dev/null and b/docs/images/logo-banner.png differ diff --git a/docs/images/patch.gif b/docs/images/patch.gif new file mode 100755 index 0000000000000000000000000000000000000000..851d239435fffbbf9ad886cc60567b29b854cd1d Binary files /dev/null and b/docs/images/patch.gif differ diff --git a/docs/images/post-training.png b/docs/images/post-training.png new file mode 100755 index 0000000000000000000000000000000000000000..44e33c7be995e28710e2f62d521313cacace362b Binary files /dev/null and b/docs/images/post-training.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100755 index 0000000000000000000000000000000000000000..4342cdcd616cdfe55f0d982fb0a7e220b1bb3038 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,186 @@ + + +# Liger Kernel: Efficient Triton Kernels for LLM Training + + + + + + + + + + + + + + + + + +
+<!-- Badge table (Stable / Nightly / Discord / Build): PyPI download and version badges for the stable and nightly packages, a Discord invite badge, and CI build status badges. -->
+
+
+**Liger Kernel** is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU **training throughput by 20%** and reduce **memory usage by 60%**. We have implemented **Hugging Face Compatible** `RMSNorm`, `RoPE`, `SwiGLU`, `CrossEntropy`, `FusedLinearCrossEntropy`, and more to come. The kernel works out of the box with [Flash Attention](https://github.com/Dao-AILab/flash-attention), [PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and [Microsoft DeepSpeed](https://github.com/microsoft/DeepSpeed). We welcome contributions from the community to gather the best kernels for LLM training.
+
+We've also added optimized Post-Training kernels that deliver **up to 80% memory savings** for alignment and distillation tasks. We support losses like DPO, CPO, ORPO, SimPO, JSD, and many more. Check out [how we optimize the memory](https://x.com/hsu_byron/status/1866577403918917655).
+
+## Supercharge Your Model with Liger Kernel
+
+With one line of code, Liger Kernel can increase throughput by more than 20% and reduce memory usage by 60%, thereby enabling longer context lengths, larger batch sizes, and massive vocabularies.
+
+
+| Speed Up | Memory Reduction |
+|--------------------------|-------------------------|
+| ![Speed up](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-tps.png) | ![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-memory.png) |
+
+> **Note:**
+> - Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
+> - Hugging Face models start to OOM at a 4K context length, whereas Hugging Face + Liger Kernel scales up to 16K.
+
+## Optimize Post Training with Liger Kernel
+
+![Post Training](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/post-training.png)
+
+We provide optimized post training kernels like DPO, ORPO, SimPO, and more, which can reduce memory usage by up to 80%. You can easily use them as Python modules.
+
+```python
+from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
+orpo_loss = LigerFusedLinearORPOLoss()
+y = orpo_loss(lm_head.weight, x, target)
+```
+
+#### Key Features
+
+- **Ease of use:** Simply patch your Hugging Face model with one line of code, or compose your own model using our Liger Kernel modules.
+- **Time and memory efficient:** In the same spirit as Flash-Attn, but for layers like **RMSNorm**, **RoPE**, **SwiGLU**, and **CrossEntropy**! Increases multi-GPU training throughput by 20% and reduces memory usage by 60% with **kernel fusion**, **in-place replacement**, and **chunking** techniques.
+- **Exact:** Computation is exact—no approximations! Both forward and backward passes are implemented with rigorous unit tests and undergo convergence testing against training runs without Liger Kernel to ensure accuracy.
+- **Lightweight:** Liger Kernel has minimal dependencies, requiring only Torch and Triton—no extra libraries needed! Say goodbye to dependency headaches!
+- **Multi-GPU supported:** Compatible with multi-GPU setups (PyTorch FSDP, DeepSpeed, DDP, etc.).
+- **Trainer Framework Integration**: [Axolotl](https://github.com/axolotl-ai-cloud/axolotl), [LLaMa-Factory](https://github.com/hiyouga/LLaMA-Factory), [SFTTrainer](https://github.com/huggingface/trl/releases/tag/v0.10.1), [Hugging Face Trainer](https://github.com/huggingface/transformers/pull/32860), [SWIFT](https://github.com/modelscope/ms-swift)
+
+### Installation
+
+To install the stable version:
+
+```bash
+$ pip install liger-kernel
+```
+
+To install the nightly version:
+
+```bash
+$ pip install liger-kernel-nightly
+```
+
+To install from source:
+
+```bash
+git clone https://github.com/linkedin/Liger-Kernel.git
+cd Liger-Kernel
+
+# Install Default Dependencies
+# Setup.py will detect whether you are using AMD or NVIDIA
+pip install -e .
+
+# Setup Development Dependencies
+pip install -e ".[dev]"
+```
+
+!!! Note "Dependencies"
+
+    #### CUDA
+
+    - `torch >= 2.1.2`
+    - `triton >= 2.3.0`
+
+    #### ROCm
+
+    - `torch >= 2.5.0` Install according to the instructions on the PyTorch official webpage.
+    - `triton >= 3.0.0` Install from PyPI (e.g. `pip install triton==3.0.0`).
+
+!!! Tip "Optional Dependencies"
+
+    - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working with will dictate the minimum version of transformers.
+
+!!! Note
+    Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
+
+
+#### Sponsorship and Collaboration
+
+- [AMD](https://www.amd.com/en.html): Providing AMD GPUs for our AMD CI.
+- [Intel](https://www.intel.com/): Providing Intel GPUs for our Intel CI.
+- [Modal](https://modal.com/): Free 3000 credits from GPU MODE IRL for our NVIDIA CI.
+- [EmbeddedLLM](https://embeddedllm.com/): Making Liger Kernel run fast and stable on AMD.
+- [HuggingFace](https://huggingface.co/): Integrating Liger Kernel into Hugging Face Transformers and TRL.
+- [Lightning AI](https://lightning.ai/): Integrating Liger Kernel into Lightning Thunder.
+- [Axolotl](https://axolotl.ai/): Integrating Liger Kernel into Axolotl.
+- [Llama-Factory](https://github.com/hiyouga/LLaMA-Factory): Integrating Liger Kernel into Llama-Factory.
+
+
+!!! Note "Contact"
+
+    - For issues, create a GitHub ticket in this repository.
+    - For open discussion, join [our Discord channel](https://discord.gg/gpumode).
+    - For formal collaboration, send an email to byhsu@linkedin.com.
+
+### Cite this work
+
+BibTeX entry:
+```bib
+@inproceedings{
+hsu2025ligerkernel,
+title={Liger-Kernel: Efficient Triton Kernels for {LLM} Training},
+author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen and Zhipeng Wang},
+booktitle={Championing Open-source DEvelopment in ML Workshop @ ICML25},
+year={2025},
+url={https://openreview.net/forum?id=36SjAIT42G}
+}
+```
+
+### Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
diff --git a/docs/license.md b/docs/license.md new file mode 100755 index 0000000000000000000000000000000000000000..53e5e7d25e9487689904dcf12a22297d9b4e85a9 --- /dev/null +++ b/docs/license.md @@ -0,0 +1,8 @@ +This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details). +It also includes components from projects licensed under: + +- Apache License 2.0 (see `LICENSE-APACHE-2.0` for details). +- MIT License (see `LICENSE-MIT-AutoAWQ` for details). +- MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details). +- MIT License (see `LICENSE-MIT-llmc` for details). +- MIT License (see `LICENSE-MIT-triton` for details). \ No newline at end of file diff --git a/examples/alignment/accelerate_config.yaml b/examples/alignment/accelerate_config.yaml new file mode 100755 index 0000000000000000000000000000000000000000..e70f3cdcf7744b6a542583ad87fc67bbe95d835e --- /dev/null +++ b/examples/alignment/accelerate_config.yaml @@ -0,0 +1,26 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_activation_checkpointing: false + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/examples/alignment/run_orpo.py b/examples/alignment/run_orpo.py new file mode 100755 index 0000000000000000000000000000000000000000..7dc9450c0160a70dc595c910529dfcfc265c3943 --- /dev/null +++ b/examples/alignment/run_orpo.py @@ -0,0 +1,35 @@ +import torch + +from datasets import load_dataset +from transformers import AutoModelForCausalLM +from transformers import AutoTokenizer +from trl import ORPOConfig # noqa: F401 + +from liger_kernel.transformers.trainer import LigerORPOTrainer # noqa: F401 + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct", + dtype=torch.bfloat16, +) + +tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct", + max_length=512, + padding="max_length", +) +tokenizer.pad_token = tokenizer.eos_token + +train_dataset = load_dataset("trl-lib/tldr-preference", split="train") + +training_args = ORPOConfig( + output_dir="Llama3.2_1B_Instruct", + beta=0.1, + max_length=128, + per_device_train_batch_size=32, + max_steps=100, + save_strategy="no", +) + +trainer = LigerORPOTrainer(model=model, args=training_args, tokenizer=tokenizer, train_dataset=train_dataset) + +trainer.train() diff --git a/examples/huggingface/README.md b/examples/huggingface/README.md new file mode 100755 index 0000000000000000000000000000000000000000..41de0dcb737ae912d3935e10d23f23f9ebe212e5 --- /dev/null +++ b/examples/huggingface/README.md @@ -0,0 +1,55 @@ +# Liger-Kernel Example with HuggingFace Trainer + +## How to Run + +### Locally on a GPU machine +You can run the example locally on a GPU machine. The default hyperparameters and configurations work on single node with 4xA100 80GB GPUs. 
+
+```bash
+pip install -r requirements.txt
+sh run_{MODEL}.sh
+```
+
+### Remotely on Modal
+If you do not have access to a GPU machine, you can run the example on Modal, a serverless platform that runs your code on remote GPU machines. You can sign up for a free account at [Modal](https://www.modal.com/).
+
+```bash
+pip install modal
+modal setup # authenticate with Modal
+modal run launch_on_modal.py --script "run_qwen2_vl.sh"
+```
+
+**Notes**
+1. This example uses an optional `use_liger` flag. If true, it applies a one-line monkey patch (sketched above) to enable the Liger kernel.
+2. The example uses the Llama3 model, which requires a community license agreement and HuggingFace Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree on the community license agreement https://huggingface.co/meta-llama/Meta-Llama-3-8B
+    * Run `huggingface-cli login` and enter your HuggingFace token
+3. The default hyperparameters and configurations work on a single node with 4xA100 80GB GPUs. For running on devices with less GPU RAM, consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
+
+
+## Benchmark Result
+
+### LLaMA
+Benchmark conditions: LLaMA 3-8B, Alpaca Dataset, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 20%, while GPU memory usage drops by 40%. This allows you to train the model on smaller GPUs, use larger batch sizes, or handle longer sequence lengths without incurring additional costs.
+
+![Throughput](img/llama_tps.png)
+![GPU Memory Allocated](img/llama_mem_alloc.png)
+
+### QWEN
+Benchmark conditions: Qwen2-7B, Alpaca Dataset, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 10%, while GPU memory usage drops by 50%.
+
+![Throughput](img/qwen_tps.png)
+![GPU Memory Allocated](img/qwen_mem_alloc.png)
+
+
+### GEMMA 7B
+Benchmark conditions: Gemma-7B, Alpaca Dataset, Max seq len = 512, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 4 A100s.
+
+Throughput improves by around 24%, while GPU memory usage drops by 33%.
+
+![Throughput](img/gemma_7b_tp.png)
+![GPU Memory Allocated](img/gemma_7b_mem.png)
diff --git a/examples/huggingface/callback.py b/examples/huggingface/callback.py
new file mode 100755
index 0000000000000000000000000000000000000000..c834fc56634a4444fb27edd2b1f9a27f061c0001
--- /dev/null
+++ b/examples/huggingface/callback.py
@@ -0,0 +1,257 @@
+import time
+
+from dataclasses import dataclass
+
+import torch
+import transformers
+
+from transformers import TrainerControl
+from transformers import TrainerState
+from transformers import TrainingArguments
+
+from liger_kernel.utils import infer_device
+
+# https://simple.wikipedia.org/wiki/Byte
+# For memory, we use the binary system
+M_BIN_UNIT = 2**20
+# For metrics (tflops), we use the decimal system
+T_DEC_UNIT = 10**12
+
+
+def round_to_n_decimal(x, n):
+    return round(x, n)
+
+
+@dataclass
+class Precision:
+    """
+    Precision is a dataclass to store the number of decimal points for each metric.
+    """
+
+    n_decimal_time: int
+    n_decimal_memory: int
+    n_decimal_TPS: int
+
+
+@dataclass
+class State:
+    """
+    State is a dataclass to store the internal state of the efficiency callback.
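+
+    It tracks the warmup boundary, peak memory, and the elapsed time, steps,
+    and tokens from which the aggregated metrics are derived.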
+ """ + + n_warmup_steps: int = 0 + total_peak_memory_allocated: float = float("-inf") + total_peak_memory_reserved: float = float("-inf") + + step_start_time: float = 0.0 + elapsed_time: float = 0.0 + + elapsed_step: int = 0 + + step_start_tokens_seen: int = 0 + elapsed_tokens_seen: int = 0 + + global_start_step: int = 0 + + +@dataclass +class Time: + """ + Time is a dataclass to store the time-related metrics. + """ + + step: int = 0 + step_time_sec: float = 0.0 + avg_step_time_sec: float = 0.0 + time_to_completion_sec: float = 0.0 + estimated_total_time_sec: float = 0.0 + + +@dataclass +class Memory: + """ + Memory is a dataclass to store the memory-related metrics. + """ + + step_peak_memory_allocated_MB: float = 0.0 + step_peak_memory_reserved_MB: float = 0.0 + total_peak_memory_allocated_MB: float = 0.0 + total_peak_memory_reserved_MB: float = 0.0 + + +@dataclass +class TPS: + """ + TPS is a dataclass to store the tokens per second metrics. + """ + + step_tokens_per_second: float = 0.0 + avg_tokens_per_second: float = 0.0 + + +class EfficiencyCallback(transformers.TrainerCallback): + """ + EfficiencyCallback is a callback to track the efficiency of the training process. + The tracked stats include: step time, memory, and throughput. + + It requires including `--include_num_input_tokens_seen` and `logging_steps=1` in the training arguments. + + Args: + n_warmup_steps: number of warmup steps + The stats in the first n_warmup_steps will not be added into the aggregated stats + This is because the first few steps might take longer due to jit compliation and other initialization overheads + n_decimal_time: number of decimal points for time + n_decimal_memory: number of decimal points for memory + n_decimal_TPS: number of decimal points for TPS + """ + + def __init__(self, n_warmup_steps=2, n_decimal_time=2, n_decimal_memory=2, n_decimal_TPS=2): + self.state = State( + n_warmup_steps, + ) + + self.precision = Precision(n_decimal_time, n_decimal_memory, n_decimal_TPS) + + self.time = Time() + self.memory = Memory() + self.tps = TPS() + self.device = infer_device() + + def on_init_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event called at the end of the initialization of the [`Trainer`]. + """ + if not args.include_num_input_tokens_seen: + raise Exception( + 'Please pass training argument "--include_num_input_tokens_seen" to track tokens per second' + ) + if args.logging_steps != 1: + raise Exception("Please set logging_steps=1 to track the efficiency metrics accurately") + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + # if loaded from checkpoints, global_start_step is not 1 but state.global_step + self.state.global_start_step = state.global_step + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: dict[str, float], + **kwargs, + ): + if state.global_step < (self.state.global_start_step + self.state.n_warmup_steps): + return + else: + # spread self.time, self.memory, self.tps to logs + logs.update(self.time.__dict__) + logs.update(self.memory.__dict__) + logs.update(self.tps.__dict__) + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. 
+ """ + # memory + getattr(torch, self.device).reset_peak_memory_stats() + + # time + self.state.step_start_time = time.perf_counter() + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if state.global_step < (self.state.global_start_step + self.state.n_warmup_steps): + # The end the current step_start_tokens_seen is the start of next iteration + + # tokens + self.state.step_start_tokens_seen = state.num_input_tokens_seen + return + + # time + current_time = time.perf_counter() + step_time = current_time - self.state.step_start_time + self.state.elapsed_time += step_time + + # step + global_step = state.global_step + self.state.elapsed_step += 1 + avg_step_time = self.state.elapsed_time / self.state.elapsed_step + + self.time.step = global_step + self.time.step_time_sec = round_to_n_decimal(step_time, self.precision.n_decimal_time) + self.time.avg_step_time_sec = round_to_n_decimal(avg_step_time, self.precision.n_decimal_time) + self.time.time_to_completion_sec = round_to_n_decimal( + avg_step_time * (state.max_steps - global_step), + self.precision.n_decimal_time, + ) + self.time.estimated_total_time_sec = round_to_n_decimal( + avg_step_time * state.max_steps, self.precision.n_decimal_time + ) + + # memory + step_peak_memory_allocated = getattr(torch, self.device).memory.max_memory_allocated() + step_peak_memory_reserved = getattr(torch, self.device).memory.max_memory_reserved() + + self.memory.step_peak_memory_allocated_MB = round_to_n_decimal( + step_peak_memory_allocated / M_BIN_UNIT, self.precision.n_decimal_memory + ) + self.state.total_peak_memory_allocated = max(self.state.total_peak_memory_allocated, step_peak_memory_allocated) + self.memory.total_peak_memory_allocated_MB = round_to_n_decimal( + self.state.total_peak_memory_allocated / M_BIN_UNIT, + self.precision.n_decimal_memory, + ) + + self.memory.step_peak_memory_reserved_MB = round_to_n_decimal( + step_peak_memory_reserved / M_BIN_UNIT, self.precision.n_decimal_memory + ) + + self.state.total_peak_memory_reserved = max(self.state.total_peak_memory_reserved, step_peak_memory_reserved) + + self.memory.total_peak_memory_reserved_MB = round_to_n_decimal( + self.state.total_peak_memory_reserved / M_BIN_UNIT, + self.precision.n_decimal_memory, + ) + + # tokens + step_tokens_seen = state.num_input_tokens_seen - self.state.step_start_tokens_seen + + self.state.elapsed_tokens_seen += step_tokens_seen + + self.tps.step_tokens_per_second = round_to_n_decimal( + step_tokens_seen / step_time, + self.precision.n_decimal_TPS, + ) + + self.tps.avg_tokens_per_second = round_to_n_decimal( + self.state.elapsed_tokens_seen / self.state.elapsed_time, + self.precision.n_decimal_TPS, + ) + + # The end the current step_start_tokens_seen is the start of next iteration + + # tokens + self.state.step_start_tokens_seen = state.num_input_tokens_seen diff --git a/examples/huggingface/config/fsdp_config.json b/examples/huggingface/config/fsdp_config.json new file mode 100755 index 0000000000000000000000000000000000000000..45894b0b50bf5f843837ebe2f07d199f5e8a8df0 --- /dev/null +++ b/examples/huggingface/config/fsdp_config.json @@ -0,0 +1,5 @@ +{ + "backward_prefetch": "backward_pre", + "forward_prefetch": "true", + "activation_checkpointing": true +} \ No newline at end of file diff --git a/examples/huggingface/img/gemma_7b_mem.png b/examples/huggingface/img/gemma_7b_mem.png new file mode 100755 index 0000000000000000000000000000000000000000..940d0918bd2c74aaff45666b5d5b4f4778f3fb56 
Binary files /dev/null and b/examples/huggingface/img/gemma_7b_mem.png differ diff --git a/examples/huggingface/img/gemma_7b_tp.png b/examples/huggingface/img/gemma_7b_tp.png new file mode 100755 index 0000000000000000000000000000000000000000..7163543df49e0fe615e6f881f7cb4209125146aa Binary files /dev/null and b/examples/huggingface/img/gemma_7b_tp.png differ diff --git a/examples/huggingface/img/llama_mem_alloc.png b/examples/huggingface/img/llama_mem_alloc.png new file mode 100755 index 0000000000000000000000000000000000000000..8f89581e5c0c7f2838aa1a9a8bedb05789fe3e18 Binary files /dev/null and b/examples/huggingface/img/llama_mem_alloc.png differ diff --git a/examples/huggingface/img/llama_tps.png b/examples/huggingface/img/llama_tps.png new file mode 100755 index 0000000000000000000000000000000000000000..37dd35a3ee417ee213089c308c271748695a808a Binary files /dev/null and b/examples/huggingface/img/llama_tps.png differ diff --git a/examples/huggingface/img/qwen_mem_alloc.png b/examples/huggingface/img/qwen_mem_alloc.png new file mode 100755 index 0000000000000000000000000000000000000000..9f4154bbb4bfe815938ebad940c8830c95b69891 Binary files /dev/null and b/examples/huggingface/img/qwen_mem_alloc.png differ diff --git a/examples/huggingface/img/qwen_tps.png b/examples/huggingface/img/qwen_tps.png new file mode 100755 index 0000000000000000000000000000000000000000..cbc86c8f41fddc118861541158645b632f2825ae Binary files /dev/null and b/examples/huggingface/img/qwen_tps.png differ diff --git a/examples/huggingface/launch_on_modal.py b/examples/huggingface/launch_on_modal.py new file mode 100755 index 0000000000000000000000000000000000000000..1171ea42d94f55fbdbd954ba44b3a4447639d800 --- /dev/null +++ b/examples/huggingface/launch_on_modal.py @@ -0,0 +1,69 @@ +""" +launch_on_modal.py + +This tool is designed to launch scripts using Modal. + +It sets up the necessary environment, including GPU resources and python dependencies, +and executes the specified training script remotely. + +### Setup and Usage +```bash +pip install modal +modal setup # authenticate with Modal +export HF_TOKEN="your_huggingface_token" # if using a gated model such as llama3 +modal run launch_on_modal.py --script "run_qwen2_vl.sh" +``` + +### Caveats +This tool is intended as an easy on-ramp to using Liger-Kernel for fine-tuning LLMs and +VLMs - it is a reproducible way to run benchmarks and example scripts. However, it is not +the best way to develop a model on Modal, as it re-downloads the model and dataset each +time it is run. For iterative development, consider using `modal.Volume` to cache the +model and dataset between runs. 
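+
+A rough sketch of that caching approach (the volume name and mount path below are
+illustrative, not part of this script):
+
+```python
+cache = modal.Volume.from_name("liger-hf-cache", create_if_missing=True)
+
+@app.function(volumes={"/root/.cache/huggingface": cache}, image=image)
+def cached_launch(script: str):
+    ...  # same launch logic; model/dataset downloads persist across runs
+```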
+""" + +import os + +import modal + +from modal import gpu + +TWO_HOURS = 2 * 60 * 60 +SIXTEEN_GB = 16 * 1024 + +app = modal.App("liger-example") + +image = modal.Image.debian_slim().pip_install_from_requirements("requirements.txt").copy_local_dir(".", "/root") + +if "HF_TOKEN" not in os.environ: + print("HF_TOKEN not found in environment variables, using an empty token.") +hf_token_secret = modal.Secret.from_dict({"HF_TOKEN": os.environ.get("HF_TOKEN", "")}) + + +@app.function( + gpu=gpu.A100(count=4, size="80GB"), + image=image, + timeout=TWO_HOURS, + memory=SIXTEEN_GB, + secrets=[hf_token_secret], +) +def launch_script(script: str): + import subprocess + + script_path = f"/root/{script}" + os.chmod(script_path, 0o755) # make script executable + + print(f"Running script: {script_path}") + subprocess.run([script_path], check=True, cwd="/root", env=os.environ.copy()) + + +@app.local_entrypoint() +def main(script: str): + """ + Launch a script remotely on modal. + ```bash + export HF_TOKEN="your_huggingface_token" # if using a gated model such as llama3 + modal run --detach launch_on_modal.py --script "run_qwen2_vl.sh" + ``` + """ + launch_script.remote(script=script) diff --git a/examples/huggingface/requirements.txt b/examples/huggingface/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..d6d10e9ecd64b0f4e3ed93b43aea09e8b154a6fa --- /dev/null +++ b/examples/huggingface/requirements.txt @@ -0,0 +1,6 @@ +transformers==4.45.2 +trl +liger-kernel +triton +torch +torchvision \ No newline at end of file diff --git a/examples/huggingface/run_benchmarks.sh b/examples/huggingface/run_benchmarks.sh new file mode 100755 index 0000000000000000000000000000000000000000..cf4234aeaa4d6ad56c742f23657dba14140b0138 --- /dev/null +++ b/examples/huggingface/run_benchmarks.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +## Benchmarking Script +## Runs the training script with different configurations and logs the results + +MODEL_TYPE="mistral" +MODEL_PATH="mistralai/Mistral-7B-v0.1" +USE_LIGER_VALUES=("True" "False") +BATCH_SIZE_VALUES=(64 128 192) +NUM_REP=5 +MAX_STEPS=20 +DATASET_PATH="tatsu-lab/alpaca" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +mkdir -p "${SCRIPT_DIR}/results" + +for USE_LIGER in "${USE_LIGER_VALUES[@]}"; do + for BATCH_SIZE in "${BATCH_SIZE_VALUES[@]}"; do + echo "Running with use_liger=$USE_LIGER and batch_size=$BATCH_SIZE" + + for ((i=1; i<=NUM_REP; i++)); do + + LOG_FILE="${SCRIPT_DIR}/results/${MODEL_TYPE}_use_liger_${USE_LIGER}_batch_size_${BATCH_SIZE}_rep_${i}.log" + + torchrun --nnodes=1 --nproc-per-node=4 training.py \ + --bf16 \ + --num_train_epochs 1 \ + --max_steps $MAX_STEPS \ + --model_name $MODEL_PATH \ + --dataset $DATASET_PATH \ + --per_device_train_batch_size $BATCH_SIZE \ + --per_device_eval_batch_size 16 \ + --eval_strategy "no" \ + --save_strategy "no" \ + --learning_rate 6e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --include_num_input_tokens_seen \ + --report_to none \ + --fsdp "full_shard auto_wrap" \ + --fsdp_config config/fsdp_config.json \ + --seed 42 \ + --use_liger $USE_LIGER \ + --output_dir model_output_dir \ + > $LOG_FILE + + sleep 5 + done + done +done \ No newline at end of file diff --git a/examples/huggingface/run_gemma.sh b/examples/huggingface/run_gemma.sh new file mode 100755 index 0000000000000000000000000000000000000000..c882f5e7f720f32f34c3d8fdad92534c6da16411 --- /dev/null +++ b/examples/huggingface/run_gemma.sh @@ -0,0 +1,22 @@ +#!/bin/bash 
+ +torchrun --nnodes=1 --nproc-per-node=4 training.py \ + --model_name "google/gemma-7b-it" \ + --bf16 \ + --max_steps 20 \ + --per_device_train_batch_size 24 \ + --per_device_eval_batch_size 1 \ + --eval_strategy "no" \ + --save_strategy "no" \ + --learning_rate 6e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --include_num_input_tokens_seen \ + --report_to none \ + --fsdp "full_shard auto_wrap" \ + --fsdp_config config/fsdp_config.json \ + --seed 42 \ + --use_liger True \ + --output_dir alpaca_finetuning diff --git a/examples/huggingface/run_llama.sh b/examples/huggingface/run_llama.sh new file mode 100755 index 0000000000000000000000000000000000000000..b6a1fc73f74572ee56f4850e74021ea958e8843b --- /dev/null +++ b/examples/huggingface/run_llama.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +torchrun --nnodes=1 --nproc-per-node=4 training.py \ + --bf16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 64 \ + --per_device_eval_batch_size 64 \ + --eval_strategy "no" \ + --save_strategy "no" \ + --learning_rate 6e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --include_num_input_tokens_seen \ + --report_to none \ + --fsdp "full_shard auto_wrap" \ + --fsdp_config config/fsdp_config.json \ + --seed 42 \ + --use_liger True \ + --output_dir alpaca_finetuning diff --git a/examples/huggingface/run_qwen.sh b/examples/huggingface/run_qwen.sh new file mode 100755 index 0000000000000000000000000000000000000000..54a157fbc265a36e75965f65e3686937d9eb1485 --- /dev/null +++ b/examples/huggingface/run_qwen.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +torchrun --nnodes=1 --nproc-per-node=4 training.py \ + --model_name "Qwen/Qwen2-7B" \ + --bf16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 48 \ + --per_device_eval_batch_size 64 \ + --eval_strategy "no" \ + --save_strategy "no" \ + --learning_rate 6e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --include_num_input_tokens_seen \ + --report_to none \ + --fsdp "full_shard auto_wrap" \ + --fsdp_config config/fsdp_config.json \ + --seed 42 \ + --use_liger True \ + --output_dir alpaca_finetuning diff --git a/examples/huggingface/run_qwen2_vl.sh b/examples/huggingface/run_qwen2_vl.sh new file mode 100755 index 0000000000000000000000000000000000000000..963600f0120f0a9da95da417ca87c7cbf4e4f3ff --- /dev/null +++ b/examples/huggingface/run_qwen2_vl.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +torchrun --nnodes=1 --nproc-per-node=4 training_multimodal.py \ + --model_name "Qwen/Qwen2-VL-7B-Instruct" \ + --bf16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --eval_strategy "no" \ + --save_strategy "no" \ + --learning_rate 6e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --include_num_input_tokens_seen \ + --report_to none \ + --fsdp "full_shard auto_wrap" \ + --fsdp_config config/fsdp_config.json \ + --seed 42 \ + --use_liger True \ + --output_dir multimodal_finetuning diff --git a/examples/huggingface/training.py b/examples/huggingface/training.py new file mode 100755 index 0000000000000000000000000000000000000000..d431b10111215a87c636c80da92763f329d2e6a9 --- /dev/null +++ b/examples/huggingface/training.py @@ -0,0 +1,79 @@ +from dataclasses import dataclass + +import datasets +import torch +import transformers + +from callback import EfficiencyCallback +from trl import 
DataCollatorForCompletionOnlyLM +from trl import SFTTrainer + +from liger_kernel.transformers import AutoLigerKernelForCausalLM + + +@dataclass +class CustomArguments: + model_name: str = "meta-llama/Meta-Llama-3-8B" + dataset: str = "tatsu-lab/alpaca" + max_seq_length: int = 512 + use_liger: bool = False + + +def formatting_prompts_func(example): + return example["text"] + + +def train(): + parser = transformers.HfArgumentParser((transformers.TrainingArguments, CustomArguments)) + training_args, custom_args = parser.parse_args_into_dataclasses() + tokenizer = transformers.AutoTokenizer.from_pretrained( + custom_args.model_name, + padding_side="left", + truncation_side="left", + ) + tokenizer.pad_token = tokenizer.eos_token + + dataset = datasets.load_dataset(custom_args.dataset)["train"].train_test_split(test_size=0.1) + train_dataset = dataset["train"] + eval_dataset = dataset["test"] + response_prompt = tokenizer.encode("### Response:\n", add_special_tokens=False) + collator = DataCollatorForCompletionOnlyLM( + tokenizer=tokenizer, + response_template=response_prompt, + pad_to_multiple_of=16, + ) + + if custom_args.use_liger: + model = AutoLigerKernelForCausalLM.from_pretrained( + custom_args.model_name, + trust_remote_code=True, + use_cache=False, + dtype=torch.bfloat16, + # These args will get passed to the appropriate apply_liger_kernel_to_* function + # to override the default settings + # cross_entropy=True, + # fused_linear_cross_entropy=False, + ) + else: + model = transformers.AutoModelForCausalLM.from_pretrained( + custom_args.model_name, + trust_remote_code=True, + use_cache=False, + dtype=torch.bfloat16, + ) + + trainer = SFTTrainer( + model=model, + args=training_args, + data_collator=collator, + max_seq_length=custom_args.max_seq_length, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + formatting_func=formatting_prompts_func, + callbacks=[EfficiencyCallback()], + ) + trainer.train() + + +if __name__ == "__main__": + train() diff --git a/examples/huggingface/training_multimodal.py b/examples/huggingface/training_multimodal.py new file mode 100755 index 0000000000000000000000000000000000000000..1fcee87da2d79abccdb8b7608be6a8f57438c372 --- /dev/null +++ b/examples/huggingface/training_multimodal.py @@ -0,0 +1,169 @@ +import os + +from dataclasses import dataclass + +import datasets +import torch +import transformers + +from callback import EfficiencyCallback +from datasets import Image as ImageFeature +from trl import SFTTrainer + +from liger_kernel.transformers import monkey_patch + + +@dataclass +class CustomArguments: + model_name: str = "Qwen/Qwen2-VL-2B-Instruct" + dataset: str = "HuggingFaceM4/the_cauldron" + dataset_subset: str = "ai2d" + dataset_split: str = "train" + max_seq_length: int = 512 + dataset_text_field: str = "texts" + use_liger: bool = False + + +def construct_model_and_processor(model_name: str, use_liger: bool) -> torch.nn.Module: + if "Qwen2-VL" in model_name: + from transformers import Qwen2VLForConditionalGeneration + + # These settings are used to reduce the memory footprint of the Qwen2-VL model, + # which supports training/inferences on images in their native resolution. Large + # images -> many visual tokens (a max of 16384) -> large memory consumption. + # If fine-tuning for a real-world application, consider these values carefully. 
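+        # For intuition: 256 visual tokens x (28 x 28) pixels/token = 200,704 pixels,
+        # i.e. each image is effectively capped at about 448 x 448 after resizing.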
+ min_visual_tokens_per_image = 256 + max_visual_tokens_per_image = 256 + + processor = transformers.AutoProcessor.from_pretrained( + model_name, + padding_side="left", + truncation_side="left", + min_pixels=min_visual_tokens_per_image * 28 * 28, # patch size is 14x14 + max_pixels=max_visual_tokens_per_image * 28 * 28, # 4 patches / token + ) + processor.tokenizer.pad_token = processor.tokenizer.eos_token + image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>") + + if use_liger: + print("Applying Liger Kernel to Qwen2-VL model") + monkey_patch.apply_liger_kernel_to_qwen2_vl( + # These args can be used to override the default Liger settings + # cross_entropy=True, + # fused_linear_cross_entropy=False, + ) + + model = Qwen2VLForConditionalGeneration.from_pretrained( + pretrained_model_name_or_path=model_name, + use_cache=False, + dtype=torch.bfloat16, + low_cpu_mem_usage=True, + attn_implementation="sdpa", + ) + return model, processor, image_token_id + + raise NotImplementedError(f"Model {model_name} not supported") + + +def _validate_and_extract_the_cauldron(examples) -> dict[str, list]: + batch_texts = [] + batch_images = [] + for images, texts in zip(examples["images"], examples["texts"]): + if not images: + raise ValueError("No image found in example from the_cauldron dataset") + if len(images) > 1: + raise ValueError("Only one image per example is supported") + batch_texts.extend(texts) + batch_images.extend([images[0]] * len(texts)) + return {"texts": batch_texts, "images": batch_images} + + +def _format_for_convo(example, tokenizer): + # cauldron data is already in message format {"user": ..., "assistant": ...} + text = example["texts"] + messages = [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": text["user"]}], + }, + {"role": "assistant", "content": [{"type": "text", "text": text["assistant"]}]}, + ] + text = tokenizer.apply_chat_template(messages, tokenize=False) + return {"texts": text} + + +def train(): + parser = transformers.HfArgumentParser((transformers.TrainingArguments, CustomArguments)) + training_args, custom_args = parser.parse_args_into_dataclasses() + training_args.remove_unused_columns = False # required to not drop the image column + training_args.dataset_kwargs = {"skip_prepare_dataset": True} + + model, processor, image_token_id = construct_model_and_processor(custom_args.model_name, custom_args.use_liger) + + dataset = ( + datasets.load_dataset( + custom_args.dataset, + custom_args.dataset_subset, + split=custom_args.dataset_split, + ) + .map( + _validate_and_extract_the_cauldron, + batched=True, + num_proc=min(os.cpu_count(), 16), + desc="Extracting text and images", + ) + .map( + _format_for_convo, + fn_kwargs={"tokenizer": processor.tokenizer}, + desc="Formatting for convo", + ) + .cast_column("images", ImageFeature()) + .train_test_split(test_size=0.1) + ) + + train_dataset = dataset["train"] + eval_dataset = dataset["test"] + + def collate_fn(examples): + """ + Taken directly from the TRL documentation with minor modifications: + https://huggingface.co/docs/trl/en/sft_trainer#a-custom-collator-for-processing-multi-modal-data + + Modifications: + 1. `apply_chat_template` is used to preprocess the texts before training begins (see above) + 2. `example["messages"]` -> `example["texts"]` to conform with the_cauldron dataset schema + 3. 
Ignoring image tokens in the loss computation
+        """
+        # Get the texts and images
+        texts = [example["texts"] for example in examples]
+        images = [example["images"] for example in examples]
+
+        # Tokenize the texts and process the images
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+        # The labels are the input_ids, and we mask the padding tokens in the loss computation
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100
+
+        # Ignore the image token index in the loss computation
+        labels[labels == image_token_id] = -100
+        batch["labels"] = labels
+
+        return batch
+
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        data_collator=collate_fn,
+        max_seq_length=custom_args.max_seq_length,
+        dataset_text_field=custom_args.dataset_text_field,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=processor.tokenizer,
+        callbacks=[EfficiencyCallback()],
+    )
+    trainer.train()
+
+
+if __name__ == "__main__":
+    train()
diff --git a/examples/lightning/README.md b/examples/lightning/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..916d79d5fe28b2d6d19fbd84c19ebb4c22b98769
--- /dev/null
+++ b/examples/lightning/README.md
@@ -0,0 +1,21 @@
+# Liger-Kernel Example with Lightning Trainer
+
+## How to Run
+```bash
+pip install -r requirements.txt
+
+# For single L40 48GB GPU
+python training.py --model Qwen/Qwen2-0.5B-Instruct --num_gpu 1 --max_length 1024
+
+# For 8XA100 40GB
+python training.py --model meta-llama/Meta-Llama-3-8B --strategy deepspeed
+```
+
+**Notes**
+1. The example uses the Llama3 model, which requires a community license agreement and HuggingFace Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree on the community license agreement https://huggingface.co/meta-llama/Meta-Llama-3-8B
+    * Run `huggingface-cli login` and enter your HuggingFace token
+2. The default hyperparameters and configuration for gemma work on a single L40 48GB GPU, and the config for llama works on a single node with 8xA100 40GB GPUs. For running on devices with less GPU RAM, consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
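+
+The script also supports an FSDP strategy through the flags defined in `training.py`; for example (a sketch, subject to available GPU memory):
+
+```bash
+python training.py --model meta-llama/Meta-Llama-3-8B --strategy fsdp --num_gpu 8
+```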
+ + + \ No newline at end of file diff --git a/examples/lightning/requirements.txt b/examples/lightning/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..dbba38e0e05e055aaf7acb8a83d7e8b3bad9f614 --- /dev/null +++ b/examples/lightning/requirements.txt @@ -0,0 +1,8 @@ +lightning +transformers +trl +liger-kernel +torch +triton +deepspeed +tf-keras \ No newline at end of file diff --git a/examples/lightning/training.py b/examples/lightning/training.py new file mode 100755 index 0000000000000000000000000000000000000000..ab11648783f5ae7c3867dfa097ac5880549e24cb --- /dev/null +++ b/examples/lightning/training.py @@ -0,0 +1,281 @@ +import argparse +import math +import os + +from dataclasses import _MISSING_TYPE +from dataclasses import dataclass + +import datasets +import lightning.pytorch as pl +import torch +import transformers + +from lightning.pytorch.strategies import DeepSpeedStrategy +from lightning.pytorch.strategies import FSDPStrategy +from torch.distributed.fsdp import BackwardPrefetch +from torch.distributed.fsdp import MixedPrecision +from torch.utils.data import DataLoader +from transformers.models.llama.modeling_llama import LlamaDecoderLayer +from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer +from trl import DataCollatorForCompletionOnlyLM + +from liger_kernel.transformers import AutoLigerKernelForCausalLM +from liger_kernel.utils import infer_device + +_RETAIN_COLUMNS = {"input_ids", "attention_mask", "labels"} +QUESTION = "" +CHOICES = "" + + +@dataclass +class Args: + model: str = "Qwen/Qwen2-0.5B-Instruct" + data: str = "cais/mmlu" + output_dir: str = "mmlu_finetuning" + max_length: int = 2048 + # for llam3 8B model, deepspeed will OOM with 16 on 8XA100 80G and 8 will OOM with 8XA100 40G + batch_size: int = 4 + lr: float = 6e-6 + weight_decay: float = 0.05 + warmup_ratio: float = 0.1 + seed: int = 42 + strategy: str = "auto" + num_gpu: int = None + + +def warmup_cosine_schedule(warmup_steps, total_steps, min_lr=0): + def lr_lambda(current_step): + if current_step < warmup_steps: + # Linear warmup + return float(current_step) / float(max(1, warmup_steps)) + else: + # Cosine annealing + progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps)) + return max(min_lr, 0.5 * (1 + math.cos(math.pi * progress))) + + return lr_lambda + + +def parse_args() -> Args: + parser = argparse.ArgumentParser() + for k, v in Args.__dataclass_fields__.items(): + parser.add_argument(f"--{k}", type=v.type, default=v.default) + parsed = parser.parse_args() + return Args(**{k: v for k, v in vars(parsed).items() if not isinstance(v, _MISSING_TYPE)}) + + +class LanguageModel(pl.LightningModule): + def __init__(self, args: Args, tokenizer): + super().__init__() + self.args = args + self.tokenizer = tokenizer + self.model = None + + def configure_model(self): + # https://lightning.ai/docs/pytorch/stable/advanced/model_parallel/fsdp.html#speed-up-model-initialization + if self.model is not None: + return + self.model = AutoLigerKernelForCausalLM.from_pretrained( + self.args.model, use_cache=False, ignore_mismatched_sizes=True + ) + if self.args.strategy == "deepspeed": + self.model.train() + self.model.gradient_checkpointing_enable() + + def forward(self, input_ids, attention_mask, labels=None, **kwargs): + return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs) + + def training_step(self, batch): + outputs = self.model( + input_ids=batch["input_ids"], + 
attention_mask=batch["attention_mask"], + labels=batch["labels"], + ) + loss = outputs.loss + self.log_dict( + {"train_loss": loss}, + on_step=True, + on_epoch=True, + prog_bar=True, + logger=True, + rank_zero_only=True, + sync_dist=False, + ) + return loss + + def validation_step(self, batch): + outputs = self.model( + input_ids=batch["input_ids"], + attention_mask=batch["attention_mask"], + labels=batch["labels"], + ) + loss = outputs.loss + self.log_dict( + {"val_loss": outputs.loss}, + on_step=True, + on_epoch=True, + prog_bar=True, + logger=True, + rank_zero_only=True, + sync_dist=True, + ) + return loss + + def configure_optimizers(self): + optimizer = torch.optim.AdamW( + self.parameters(), + lr=self.args.lr, + weight_decay=self.args.weight_decay, + fused=True, + ) + lr_lambda = warmup_cosine_schedule( + warmup_steps=self.trainer.estimated_stepping_batches * self.args.warmup_ratio, + total_steps=self.trainer.estimated_stepping_batches, + min_lr=0, + ) + lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + return { + "optimizer": optimizer, + "lr_scheduler": {"scheduler": lr_scheduler, "interval": "step"}, + } + + +class DataModule(pl.LightningDataModule): + def __init__(self, tokenizer, args: Args): + super().__init__() + self.args = args + self.tokenizer = tokenizer + self.response_template_str = " " + response_prompt = tokenizer.encode(f"{self.response_template_str}", add_special_tokens=False) + self.collator = DataCollatorForCompletionOnlyLM( + tokenizer=tokenizer, + response_template=response_prompt, + pad_to_multiple_of=16, + ) + + def formatting_func(self, example): + output_texts = [] + for i in range(len(example["question"])): + choices = "" + for j in range(len(example["choices"][i])): + choices += f"{j + 1}. {example['choices'][i][j]}; " + s = "Below is a question and multiple choice answers, choices separated by a semicolon. Please select the best answer for the question. 
" + s += f"{QUESTION}{example['question'][i]} " + s += f"{CHOICES}{choices} " + s += f"{self.response_template_str}{example['answer'][i]}" + output_texts.append(s) + return output_texts + + def tokenize(self, example): + outputs = self.tokenizer( + self.formatting_func(example), + truncation=True, + padding=False, + max_length=self.args.max_length, + ) + return { + "input_ids": outputs["input_ids"], + "attention_mask": outputs["attention_mask"], + } + + def setup(self, stage) -> None: + dataset = datasets.load_dataset(self.args.data, "auxiliary_train") + flattened_data = [ + { + "answer": x["train"]["answer"], + "choices": x["train"]["choices"], + "question": x["train"]["question"], + "subject": x["train"]["subject"], + } + for x in dataset["train"] + ] + dataset = datasets.Dataset.from_list(flattened_data) + dataset = dataset.train_test_split(test_size=4096, seed=self.args.seed) + train_dataset, val_dataset = dataset["train"], dataset["test"] + self.train_dataset = train_dataset.map( + self.tokenize, + remove_columns=list(set(train_dataset.column_names) - _RETAIN_COLUMNS), + batched=True, + batch_size=1, + num_proc=4, + ) + self.val_dataset = val_dataset.map( + self.tokenize, + remove_columns=list(set(val_dataset.column_names) - _RETAIN_COLUMNS), + batched=True, + batch_size=1, + num_proc=4, + ) + + def train_dataloader(self): + return DataLoader( + self.train_dataset, + batch_size=self.args.batch_size, + collate_fn=self.collator, + ) + + def val_dataloader(self): + return DataLoader( + self.val_dataset, + batch_size=self.args.batch_size, + collate_fn=self.collator, + ) + + +def train(): + args = parse_args() + pl.seed_everything(args.seed) + os.makedirs(args.output_dir, exist_ok=True) + + if "Meta-Llama-3-8B" in args.model: + layers = {LlamaDecoderLayer} + elif "Qwen2" in args.model: + layers = {Qwen2DecoderLayer} + else: + layers = {} + raise Warning(f"Unimplemented layer wrap policy for {args.model} in this example") + + if args.strategy == "fsdp": + strategy = FSDPStrategy( + auto_wrap_policy=layers, + sharding_strategy="FULL_SHARD", + backward_prefetch=BackwardPrefetch.BACKWARD_PRE, + sync_module_states=True, + activation_checkpointing_policy=layers, + mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16), + forward_prefetch=True, + ) + precision = None + elif args.strategy == "deepspeed": + strategy = DeepSpeedStrategy(stage=3) + precision = "bf16-mixed" + elif args.strategy == "ddp": + strategy = "ddp" + precision = "bf16-true" + else: + strategy = "auto" + precision = "bf16-true" + + device = infer_device() + trainer = pl.Trainer( + accelerator=device, + strategy=strategy, + devices=(getattr(torch, device).device_count() if args.num_gpu is None else args.num_gpu), + default_root_dir=args.output_dir, + log_every_n_steps=1, + max_epochs=1, + precision=precision, + ) + + tokenizer = transformers.AutoTokenizer.from_pretrained(args.model, padding_side="left", truncation_side="left") + tokenizer.pad_token = tokenizer.eos_token + data_module = DataModule( + tokenizer=tokenizer, + args=args, + ) + model = LanguageModel(args=args, tokenizer=tokenizer) + trainer.fit(model, datamodule=data_module) + + +if __name__ == "__main__": + train() diff --git a/examples/medusa/README.md b/examples/medusa/README.md new file mode 100755 index 0000000000000000000000000000000000000000..6d0ba99eb3c6cf39b5e5a85ba48f38854f409780 --- /dev/null +++ b/examples/medusa/README.md @@ -0,0 +1,72 @@ +# Liger-Kernel Example with Medusa + +Medusa is a simple framework that democratizes 
the acceleration techniques for LLM generation with multiple decoding heads. [[repo](https://github.com/FasterDecoding/Medusa)], [[paper](https://arxiv.org/abs/2401.10774)]
+
+During training, Medusa requires adding \(k\) decoding heads to the hidden states right before the regular LM head \(h_t\). The \(k\)-th head is used to predict the token in the \((t + k + 1)\)-th position of the next tokens (the original language model head is used to predict the \((t + 1)\)-th position).
+
+The Liger fused CE kernel is highly effective in this scenario, eliminating the need to materialize logits for each head, which usually consumes a large volume of memory due to the extensive vocabulary size (e.g., for LLaMA-3, the vocabulary size is 128k). The introduction of multiple heads can easily lead to OOM (Out of Memory) issues. However, thanks to the efficient Liger fused CE, which calculates the gradient in place and doesn't materialize the logits, we have observed very effective results. This efficiency opens up more opportunities for multi-token prediction research and development.
+
+
+# Instructions to Run the Training Script
+
+```
+git clone git@github.com:linkedin/Liger-Kernel.git
+cd {PATH_TO_Liger-Kernel}/Liger-Kernel/
+pip install -e .
+cd {PATH_TO_Liger-Kernel}/Liger-Kernel/examples/medusa
+pip install -r requirements.txt
+sh scripts/llama3_8b_medusa.sh
+```
+
+**Notes**
+1. This example uses an optional `use_liger` flag. If true, it applies a monkey patch to use the Liger kernel with the Medusa heads.
+2. The example uses the Llama3 model, which requires a community license agreement and HuggingFace Hub login. If you want to use Llama3 in this example, please make sure you have done the following:
+    * Agree on the community license agreement https://huggingface.co/meta-llama/Meta-Llama-3-8B
+    * Run `huggingface-cli login` and enter your HuggingFace token
+3. The default hyperparameters and configurations work on a single node with 8xA100 GPUs. For running on devices with less GPU RAM, consider reducing the per-GPU batch size and/or enabling `CPUOffload` in FSDP.
+4. We use a smaller sample of the ShareGPT data primarily to benchmark performance. To work effectively, the example requires hyperparameter tuning and careful dataset selection, ensuring the dataset has the same distribution as the LLaMA pretraining data. Contributions to enhance the example code are welcome.
+
+
+# Memory Profiling Result
+
+> **Note:**
+> 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 6, Data Type = bf16, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
+
+## Stage1
+
+Stage1 refers to Medusa-1, where the backbone model is frozen and only the weights of the Medusa heads are updated.
+
+```
+# Setting this flag to True in llama3_8b_medusa.sh enables Stage1
+--medusa_only_heads True
+```
+
+### num_head = 3
+
+![Memory](./docs/images/Memory_Stage1_num_head_3.png)
+![Throughput](./docs/images/Throughput_Stage1_num_head_3.png)
+
+### num_head = 5
+
+![Memory](./docs/images/Memory_Stage1_num_head_5.png)
+![Throughput](./docs/images/Throughput_Stage1_num_head_5.png)
+
+## Stage2
+
+```
+# Setting this flag to False in llama3_8b_medusa.sh enables Stage2
+--medusa_only_heads False
+```
+
+Stage2 refers to Medusa-2, where all the model weights are updated, including the backbone model and the Medusa heads.
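+
+For a rough sense of the scale of the savings shown below (an illustrative estimate, using the benchmark batch size above and an assumed sequence length of 512): materializing bf16 logits for one head costs about 6 x 512 x 128,256 x 2 bytes ≈ 0.8 GB, so five extra heads would add roughly 4 GB of activations (before gradients) that the fused CE kernel never allocates.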
+ +### num_head = 3 + +![Memory](./docs/images/Memory_Stage2_num_head_3.png) +![Throughput](./docs/images/Throughput_Stage2_num_head_3.png) + +### num_head = 5 + +![Memory](./docs/images/Memory_Stage2_num_head_5.png) +![Throughput](./docs/images/Throughput_Stage2_num_head_5.png) + diff --git a/examples/medusa/callback.py b/examples/medusa/callback.py new file mode 100755 index 0000000000000000000000000000000000000000..673243b770a871b42d3a99c22e0165e0a5c3b14e --- /dev/null +++ b/examples/medusa/callback.py @@ -0,0 +1,386 @@ +import os +import time + +from dataclasses import dataclass + +import torch +import transformers + +from accelerate.utils.constants import FSDP_SHARDING_STRATEGY +from transformers import TrainerControl +from transformers import TrainerState +from transformers import TrainingArguments + +from liger_kernel.utils import infer_device + +# https://simple.wikipedia.org/wiki/Byte +# For memory, we use binary system +M_BIN_UNIT = 2**20 +# For metrics (tflops), we use decimal system +T_DEC_UNIT = 10**12 + + +def round_to_n_decimal(x, n): + return round(x, n) + + +@dataclass +class Precision: + """ + Precision is a dataclass to store the number of decimal points for each metric. + """ + + n_decimal_time: int + n_decimal_memory: int + n_decimal_TPS: int + n_decimal_MFU: int + + +@dataclass +class State: + """ + State is a dataclass to store the internal state of the efficiency callback. + """ + + n_warmup_steps: int = 0 + total_peak_memory_allocated: float = float("-inf") + total_peak_memory_reserved: float = float("-inf") + + step_start_time: float = 0.0 + elapsed_time: float = 0.0 + + elapsed_step: int = 0 + + step_start_tokens_seen: int = 0 + elapsed_tokens_seen: int = 0 + + step_start_flos: float = 0.0 + elapsed_flos: float = 0.0 + + global_start_step: int = 0 + + +@dataclass +class Time: + """ + Time is a dataclass to store the time-related metrics. + """ + + step: int = 0 + step_time_sec: float = 0.0 + avg_step_time_sec: float = 0.0 + time_to_completion_sec: float = 0.0 + estimated_total_time_sec: float = 0.0 + + +@dataclass +class Memory: + """ + Memory is a dataclass to store the memory-related metrics. + """ + + step_peak_memory_allocated_MB: float = 0.0 + total_peak_memory_allocated_MB: float = 0.0 + + +@dataclass +class TPS: + """ + TPS is a dataclass to store the tokens per second metrics. + """ + + step_tokens_per_second: float = 0.0 + avg_tokens_per_second: float = 0.0 + + +@dataclass +class MFU: + """ + MFU is a dataclass to store the MFU metrics. + """ + + step_MFU: float = 0.0 + avg_MFU: float = 0.0 + + +class EfficiencyCallback(transformers.TrainerCallback): + """ + EfficiencyCallback is a callback to track the efficiency of the training process. + The tracked stats include: step time, memory, throughput, and MFU. + + It requires including `--include_num_input_tokens_seen` and `logging_steps=1` in the training arguments. 
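+
+    MFU follows the definition in the PaLM paper: achieved FLOPs per second
+    divided by the device's peak FLOPs per second (see the derivation in
+    `on_step_end` below).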
+ + Args: + n_warmup_steps: number of warmup steps + The stats in the first n_warmup_steps will not be added into the aggregated stats + This is because the first few steps might take longer due to jit compliation and other initialization overheads + n_decimal_time: number of decimal points for time + n_decimal_memory: number of decimal points for memory + n_decimal_TPS: number of decimal points for TPS + n_decimal_MFU: number of decimal points for MFU in percentage + """ + + def __init__( + self, + n_warmup_steps=2, + n_decimal_time=2, + n_decimal_memory=2, + n_decimal_TPS=2, + n_decimal_MFU=4, + ): + self.state = State( + n_warmup_steps, + ) + + self.precision = Precision( + n_decimal_time, + n_decimal_memory, + n_decimal_TPS, + n_decimal_MFU, + ) + + self.time = Time() + self.memory = Memory() + self.tps = TPS() + self.mfu = MFU() + self.device = infer_device() + + def on_init_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event called at the end of the initialization of the [`Trainer`]. + """ + if not args.include_num_input_tokens_seen: + raise Exception( + 'Please pass training argument "--include_num_input_tokens_seen" to track tokens per second' + ) + if args.logging_steps != 1: + raise Exception("Please set logging_steps=1 to track the efficiency metrics accurately") + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + # if loaded from checkpoints, global_start_step is not 1 but state.global_step + self.state.global_start_step = state.global_step + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: dict[str, float], + **kwargs, + ): + if state.global_step < (self.state.global_start_step + self.state.n_warmup_steps): + return + else: + # spread self.time, self.memory, self.tps, self.mfu to logs + # logs.update(self.time.__dict__) + logs.update(self.memory.__dict__) + logs.update(self.tps.__dict__) + # logs.update(self.mfu.__dict__) + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. 
+ """ + # memory + getattr(torch, self.device).reset_peak_memory_stats() + + # time + self.state.step_start_time = time.perf_counter() + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if state.global_step < (self.state.global_start_step + self.state.n_warmup_steps): + # The end the current step_start_tokens_seen and step_start_flos are the start of next iteration + + # tokens + self.state.step_start_tokens_seen = state.num_input_tokens_seen + # flos + self.state.step_start_flos = state.total_flos + return + + # time + current_time = time.perf_counter() + step_time = current_time - self.state.step_start_time + self.state.elapsed_time += step_time + + # step + global_step = state.global_step + self.state.elapsed_step += 1 + avg_step_time = self.state.elapsed_time / self.state.elapsed_step + + self.time.step = global_step + self.time.step_time_sec = round_to_n_decimal(step_time, self.precision.n_decimal_time) + self.time.avg_step_time_sec = round_to_n_decimal(avg_step_time, self.precision.n_decimal_time) + self.time.time_to_completion_sec = round_to_n_decimal( + avg_step_time * (state.max_steps - global_step), + self.precision.n_decimal_time, + ) + self.time.estimated_total_time_sec = round_to_n_decimal( + avg_step_time * state.max_steps, self.precision.n_decimal_time + ) + + # memory + step_peak_memory_allocated = getattr(torch, self.device).memory.max_memory_allocated() + step_peak_memory_reserved = getattr(torch, self.device).memory.max_memory_reserved() + + self.memory.step_peak_memory_allocated_MB = round_to_n_decimal( + step_peak_memory_allocated / M_BIN_UNIT, self.precision.n_decimal_memory + ) + self.state.total_peak_memory_allocated = max(self.state.total_peak_memory_allocated, step_peak_memory_allocated) + self.memory.total_peak_memory_allocated_MB = round_to_n_decimal( + self.state.total_peak_memory_allocated / M_BIN_UNIT, + self.precision.n_decimal_memory, + ) + + self.memory.step_peak_memory_reserved_MB = round_to_n_decimal( + step_peak_memory_reserved / M_BIN_UNIT, self.precision.n_decimal_memory + ) + + self.state.total_peak_memory_reserved = max(self.state.total_peak_memory_reserved, step_peak_memory_reserved) + + self.memory.total_peak_memory_reserved_MB = round_to_n_decimal( + self.state.total_peak_memory_reserved / M_BIN_UNIT, + self.precision.n_decimal_memory, + ) + + # tokens + step_tokens_seen = state.num_input_tokens_seen - self.state.step_start_tokens_seen + + self.state.elapsed_tokens_seen += step_tokens_seen + + self.tps.step_tokens_per_second = round_to_n_decimal( + step_tokens_seen / step_time, + self.precision.n_decimal_TPS, + ) + + self.tps.avg_tokens_per_second = round_to_n_decimal( + self.state.elapsed_tokens_seen / self.state.elapsed_time, + self.precision.n_decimal_TPS, + ) + + # flos + step_flos = state.total_flos - self.state.step_start_flos + self.state.elapsed_flos += step_flos + + # MFU + # 1. Definition + # + # MFU is defined as (achieved TPS) / (theoretical maximum TPS) = (achieved floating point operations per sec) / (theoretical maximum floating point operations per sec) + # Crucially, the "theoretical maximum" throughput only accounts for the required operations to compute the forward+backward passes, and not rematerialization. 
MFU therefore allows fair comparisons
+        # between training runs on different systems, as the numerator is simply the observed tokens-per-second, and the denominator is only dependent on the model architecture and published maximum FLOPs for a given system.
+        # Ref: https://arxiv.org/pdf/2204.02311
+        #
+        # 2. Implementation in huggingface
+        #
+        # current_flos = 6 * estimate_tokens(input_dict) * num_parameters()
+        # total_flos = sum(current_flos) # across all GPUs
+        # Ref: https://github.com/huggingface/transformers/blob/616bb11d487aabc231bb230b245c42214ea4b254/src/transformers/modeling_utils.py#L1196
+        #
+        # 3. Derive MFU on rank 0
+        #
+        # rank_0_flos = total_flos / n_gpus = measured_flos / effective_n_gpus
+        # rank_0_MFU = rank_0_flos / step_time
+        #
+        # For FSDP, num_parameters() is (1 / n_gpus) of the total parameters. So, the effective_n_gpus = 1
+        # For HSDP, num_parameters() is (1 / local_world_size) of the total parameters. So, the effective_n_gpus = n_nodes
+        # For no sharding and zero-2, num_parameters() is the total parameters. So, the effective_n_gpus = n_gpus
+
+        num_gpus = EfficiencyCallback._get_effective_num_gpus()
+        step_achieved_tflops = step_flos / step_time / num_gpus / T_DEC_UNIT
+
+        avg_achieved_tflops = self.state.elapsed_flos / self.state.elapsed_time / num_gpus / T_DEC_UNIT
+
+        precision_bits = 16 if args.bf16 or args.fp16 else 32
+        gpu_peak_tflops = EfficiencyCallback._get_gpu_peak_tflops(precision_bits)
+
+        self.mfu.step_MFU = round_to_n_decimal(step_achieved_tflops / gpu_peak_tflops, self.precision.n_decimal_MFU)
+
+        self.mfu.avg_MFU = round_to_n_decimal(avg_achieved_tflops / gpu_peak_tflops, self.precision.n_decimal_MFU)
+
+        # The step_start_tokens_seen and step_start_flos at the end of this step are the starting values for the next iteration
+
+        # tokens
+        self.state.step_start_tokens_seen = state.num_input_tokens_seen
+        # flos
+        self.state.step_start_flos = state.total_flos
+
+    @staticmethod
+    def _get_effective_num_gpus():
+        # Calculate the number of effective GPUs for the total FLOPs in order to calculate the per-GPU FLOPs
+        world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+        if transformers.utils.strtobool(os.environ.get("ACCELERATE_USE_FSDP", "false")):
+            sharding_strategy = os.environ.get("FSDP_SHARDING_STRATEGY", FSDP_SHARDING_STRATEGY[0]).upper()
+
+            # Either specified as string or enum number
+            if sharding_strategy in {
+                "FULL_SHARD",
+                str(FSDP_SHARDING_STRATEGY.index("FULL_SHARD") + 1),
+            }:
+                return 1
+
+            elif sharding_strategy in {
+                "HYBRID_SHARD",
+                str(FSDP_SHARDING_STRATEGY.index("HYBRID_SHARD") + 1),
+            }:
+                return world_size // int(os.environ.get("LOCAL_WORLD_SIZE", 1))
+            else:
+                return world_size
+
+        assert world_size != 0, (
+            "WORLD_SIZE should be set to a positive integer. For single GPU training, please explicitly set WORLD_SIZE=1."
+        )
+
+        # TODO: add deepspeed support
+        return world_size
+
+    @staticmethod
+    def _get_gpu_peak_tflops(precision_bits: int = 16):
+        if precision_bits not in {16, 32}:
+            raise Exception(f"Precision bits {precision_bits} is not supported")
+
+        device_name = getattr(torch, infer_device()).get_device_name()
+
+        if "A100" in device_name:
+            # data from https://www.nvidia.com/en-us/data-center/a100/
+            return 312 if precision_bits == 16 else 156
+        elif "H100" in device_name:
+            # data from https://www.nvidia.com/en-us/data-center/h100/
+            # NOTE: Specifications are one-half lower without sparsity.
+ if "NVL" in device_name: + return 1979 if precision_bits == 16 else 989 + elif "PCIe" in device_name: + return 756 if precision_bits == 16 else 378 + else: # for SXM and other variants + return 989 if precision_bits == 16 else 494 + elif "V100" in device_name: + if "NVL" in device_name: + return 125 + else: + return 112 + return None diff --git a/examples/medusa/docs/images/Memory_Stage1_num_head_3.png b/examples/medusa/docs/images/Memory_Stage1_num_head_3.png new file mode 100755 index 0000000000000000000000000000000000000000..34f044faf2fe47d1cf972bb09e9472f0dae877d7 Binary files /dev/null and b/examples/medusa/docs/images/Memory_Stage1_num_head_3.png differ diff --git a/examples/medusa/docs/images/Memory_Stage1_num_head_5.png b/examples/medusa/docs/images/Memory_Stage1_num_head_5.png new file mode 100755 index 0000000000000000000000000000000000000000..61d66f6b0465c4b4cf83013e69676eb0ed4dd128 Binary files /dev/null and b/examples/medusa/docs/images/Memory_Stage1_num_head_5.png differ diff --git a/examples/medusa/docs/images/Memory_Stage2_num_head_3.png b/examples/medusa/docs/images/Memory_Stage2_num_head_3.png new file mode 100755 index 0000000000000000000000000000000000000000..3b860a887ae3a075c905ae04b444d97af199a9df Binary files /dev/null and b/examples/medusa/docs/images/Memory_Stage2_num_head_3.png differ diff --git a/examples/medusa/docs/images/Memory_Stage2_num_head_5.png b/examples/medusa/docs/images/Memory_Stage2_num_head_5.png new file mode 100755 index 0000000000000000000000000000000000000000..f7920168225fa4351cf190cd71c78844ae4ff77b Binary files /dev/null and b/examples/medusa/docs/images/Memory_Stage2_num_head_5.png differ diff --git a/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png b/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png new file mode 100755 index 0000000000000000000000000000000000000000..68d682b6b8600afe2ec618db91bc47340826d438 Binary files /dev/null and b/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png differ diff --git a/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png b/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png new file mode 100755 index 0000000000000000000000000000000000000000..77058dc0bb9a17f51914054e528f072f7160eae5 Binary files /dev/null and b/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png differ diff --git a/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png b/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png new file mode 100755 index 0000000000000000000000000000000000000000..2595387fd68202192af50ee97afbc464e5721bc9 Binary files /dev/null and b/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png differ diff --git a/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png b/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png new file mode 100755 index 0000000000000000000000000000000000000000..9f9c2543335857485abfba79eee71b2160e2a902 Binary files /dev/null and b/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png differ diff --git a/examples/medusa/fsdp/acc-fsdp.conf b/examples/medusa/fsdp/acc-fsdp.conf new file mode 100755 index 0000000000000000000000000000000000000000..2ed641fe5a0e9e66f20f1a400b25d264f399c702 --- /dev/null +++ b/examples/medusa/fsdp/acc-fsdp.conf @@ -0,0 +1,24 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'yes' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: NO_PREFETCH + fsdp_cpu_ram_efficient_loading: true + 
+  fsdp_forward_prefetch: false
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: HYBRID_SHARD
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+main_training_function: main
+mixed_precision: bf16
+rdzv_backend: static
+same_network: true
+num_machines: 1
+num_processes: 1
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
diff --git a/examples/medusa/medusa_util.py b/examples/medusa/medusa_util.py
new file mode 100755
index 0000000000000000000000000000000000000000..7c66e0e088adabc3ff32c187e0c56d44ae10868a
--- /dev/null
+++ b/examples/medusa/medusa_util.py
@@ -0,0 +1,267 @@
+import types
+
+from typing import List
+from typing import Optional
+
+import torch
+
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
+
+
+class MedusaConfig(PretrainedConfig):
+    """
+    Configuration class for the Medusa model.
+
+    Args:
+        medusa_num_heads (int, optional): Number of heads for the Medusa layer. Default is 4.
+        medusa_num_layers (int, optional): Number of Medusa layers. Default is 1.
+        base_model_name_or_path (str, optional): The name or path of the base model. Default is "/shared/public/models/Meta-Llama-3-8B".
+        **kwargs: Additional keyword arguments to be passed to the parent class constructor.
+    """
+
+    def __init__(
+        self,
+        medusa_num_heads=4,
+        medusa_num_layers=1,
+        base_model_name_or_path="/shared/public/models/Meta-Llama-3-8B",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.medusa_num_heads = medusa_num_heads
+        self.medusa_num_layers = medusa_num_layers
+        self.base_model_name_or_path = base_model_name_or_path
+
+
+class ResBlock(nn.Module):
+    """
+    A Residual Block module.
+
+    This module performs a linear transformation followed by a SiLU activation,
+    and then adds the result to the original input, creating a residual connection.
+
+    Args:
+        hidden_size (int): The size of the hidden layers in the block.
+    """
+
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.linear = nn.Linear(hidden_size, hidden_size)
+        # Initialize as an identity mapping
+        nn.init.zeros_(self.linear.weight)
+        # Use SiLU activation to keep consistent with the Llama model
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        """
+        Forward pass of the ResBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            torch.Tensor: Output after the residual connection and activation.
+        """
+        return x + self.act(self.linear(x))
+
+
+def calculate_loss_contribution(
+    loss_i,
+    i,
+    medusa_only_heads,
+    medusa_decay_coefficient,
+    medusa_heads_coefficient,
+    medusa_scheduler_coefficient,
+):
+    if i == 0:
+        return loss_i if not medusa_only_heads else 0
+    else:
+        return loss_i * medusa_decay_coefficient**i * medusa_heads_coefficient * medusa_scheduler_coefficient
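To see how this weighting plays out, here is a small sketch with hypothetical per-head loss values; the 0.8/0.2 coefficients mirror the example launch script further below, and `calculate_loss_contribution` is assumed to be in scope:

```python
# Index 0 is the original lm_head loss, indices 1..3 are Medusa head losses.
losses = [1.20, 2.50, 3.10, 3.60]

total = 0.0
for i, loss_i in enumerate(losses):
    total += calculate_loss_contribution(
        loss_i,
        i,
        medusa_only_heads=True,            # drops the lm_head term (returns 0 for i == 0)
        medusa_decay_coefficient=0.8,      # later heads contribute geometrically less
        medusa_heads_coefficient=0.2,
        medusa_scheduler_coefficient=1.0,
    )
# i=1: 2.50 * 0.8**1 * 0.2 = 0.400
# i=2: 3.10 * 0.8**2 * 0.2 = 0.3968
# i=3: 3.60 * 0.8**3 * 0.2 = 0.36864
# total ~= 1.165
```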
+def add_medusa_heads(
+    model,
+    medusa_num_heads=4,
+    medusa_num_layers=0,
+    medusa_return: bool = False,
+    medusa_only_heads: bool = False,
+    with_liger=True,
+):
+    """
+    Args:
+        model (nn.Module): The base language model to be used.
+        medusa_num_heads (int, optional): The number of additional tokens to predict. Defaults to 4.
+        medusa_num_layers (int, optional): The number of ResBlock layers for each Medusa head. Defaults to 0.
+        medusa_return (bool, optional): If True, returns the Medusa logits; otherwise, the forward pass will use the `lm_head`. Defaults to False.
+        medusa_only_heads (bool, optional): If True, only the Medusa head weights will be updated during fine-tuning; otherwise, the entire model's weights will be updated. Defaults to False.
+        with_liger (bool, optional): If True, applies the Liger fused linear cross-entropy loss. Defaults to True.
+    """
+    hidden_size = model.lm_head.weight.shape[-1]
+    vocab_size = model.lm_head.weight.shape[0]
+    model.config.medusa_num_layers = medusa_num_layers
+    model.config.medusa_num_heads = medusa_num_heads
+    model.medusa_num_heads = medusa_num_heads
+    # Create a list of Medusa heads
+    model.medusa_head = nn.ModuleList(
+        [
+            nn.Sequential(
+                *([ResBlock(hidden_size) for _ in range(medusa_num_layers)]),
+                nn.Linear(hidden_size, vocab_size, bias=False),
+            )
+            for _ in range(medusa_num_heads)
+        ]
+    )
+
+    # Ensure medusa_head's dtype and device align with the base_model
+    model.medusa_head.to(model.dtype).to(model.device)
+
+    for i in range(medusa_num_heads):
+        # Initialize the weights of each medusa_head using the base model's weights
+        model.medusa_head[i][-1].weight.data[:] = model.lm_head.weight.data[:]
+    # Log the model summary
+    print(model)
+    model.old_forward = model.forward
+
+    def forward(
+        model,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """Forward pass of the MedusaModel.
+        Returns:
+            torch.Tensor: A tensor containing predictions from all Medusa heads.
+            (Optional) Original predictions from the base model's LM head.
+ """ + loss = 0 + medusa_logits = None + # LOG.debug("medusa_return: %s", medusa_return) + if not medusa_return: + return model.old_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # Pass input through the base model + if medusa_only_heads: + with torch.no_grad(): + outputs = model.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + # The lm_head will be frozen as well, so it's within the context of torch.no_grad() + if not with_liger: + medusa_logits = [model.lm_head(hidden_states)] + else: + outputs = model.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + if not with_liger: + medusa_logits = [model.lm_head(hidden_states)] + + if not with_liger: + for i in range(model.medusa_num_heads): + medusa_logits.append(model.medusa_head[i](hidden_states)) + medusa_logits = torch.stack(medusa_logits, dim=0) + + if model.training: + # Fix all the coefficients to 1 for now + medusa_scheduler_coefficient = 1 + medusa_heads_coefficient = 1 + medusa_decay_coefficient = 1 + loss = 0 + + if with_liger: + lce = LigerFusedLinearCrossEntropyLoss() + for i in range(model.medusa_num_heads + 1): + shift_hidden_states = ( + hidden_states[..., : -(1 + i), :].contiguous().view(-1, model.config.hidden_size) + ) + shift_labels = labels[..., (1 + i) :].contiguous().view(-1) + + weight = model.lm_head.weight if i == 0 else model.medusa_head[i - 1][-1].weight + loss_i = lce(weight, shift_hidden_states, shift_labels) + + loss += calculate_loss_contribution( + loss_i, + i, + medusa_only_heads, + medusa_decay_coefficient, + medusa_heads_coefficient, + medusa_scheduler_coefficient, + ) + else: + loss_fct = CrossEntropyLoss() + for i in range(model.medusa_num_heads + 1): + medusa_logits_i = medusa_logits[i, :, : -(1 + i)].contiguous().view(-1, medusa_logits.shape[-1]) + medusa_logits_i = medusa_logits_i.float() + medusa_labels = labels[..., (1 + i) :].contiguous().view(-1).to(medusa_logits_i.device) + + loss_i = loss_fct(medusa_logits_i, medusa_labels) + + loss += calculate_loss_contribution( + loss_i, + i, + medusa_only_heads, + medusa_decay_coefficient, + medusa_heads_coefficient, + medusa_scheduler_coefficient, + ) + else: + if model.config.pretraining_tp > 1: + raise NotImplementedError + else: + medusa_logits = [model.lm_head(hidden_states)] + for i in range(model.medusa_num_heads): + medusa_logits.append(model.medusa_head[i](hidden_states)) + + return_dict = return_dict if return_dict is not None else model.config.use_return_dict + + if not return_dict: + output = (medusa_logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=medusa_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + 
attentions=outputs.attentions, + ) + + model.forward = types.MethodType(forward, model) diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt new file mode 100755 index 0000000000000000000000000000000000000000..36ab7fe12b1bf508e8c86fa49b619a346b13ad32 --- /dev/null +++ b/examples/medusa/requirements.txt @@ -0,0 +1,3 @@ +accelerate==1.6.0 +scikit-learn +transformers==4.51.3 diff --git a/examples/medusa/scripts/llama3_8b_medusa.sh b/examples/medusa/scripts/llama3_8b_medusa.sh new file mode 100755 index 0000000000000000000000000000000000000000..3426c0fdcc891d57e010b5f96d8a05f226c4510b --- /dev/null +++ b/examples/medusa/scripts/llama3_8b_medusa.sh @@ -0,0 +1,53 @@ +#!/bin/sh + +export GPUS_PER_NODE=$(nvidia-smi --list-gpus | wc -l) +export LOCAL_WORLD_SIZE=$GPUS_PER_NODE +export NUM_NODES=$WORLD_SIZE +export WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES)) +echo "Starting training... Num nodes: $NUM_NODES, Num workers: $WORLD_SIZE" + +export OUTPUT_DIR="./llama3-8b-medusa-liger" + +export LOCAL_TRAIN_BATCH_SIZE=4 +export GRADIENT_ACCUMULATION_STEPS=1 +export LR=1e-5 + +export MEDUSA_NUM_HEADS=5 +export MEDUSA_NUM_LAYERS=1 +export MEDUSA_HEADS_COEFFICIENT=0.2 +export MEDUSA_DECAY_COEFFICIENT=0.8 +export MEDUSA_SCHEDULER=constant +export MEDUSA_LR_MULTIPLIER=4.0 + +accelerate launch --config_file fsdp/acc-fsdp.conf \ + --num_machines $NUM_NODES \ + --num_processes $WORLD_SIZE \ + train.py \ + --bf16 True \ + --output_dir $OUTPUT_DIR \ + --num_train_epochs 10 \ + --per_device_train_batch_size $LOCAL_TRAIN_BATCH_SIZE \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \ + --eval_strategy "no" \ + --save_strategy "no" \ + --prediction_loss_only \ + --learning_rate $LR \ + --weight_decay 0. \ + --warmup_ratio 0.04 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --model_max_length 1024 \ + --gradient_checkpointing True \ + --lazy_preprocess False \ + --report_to none \ + --include_num_input_tokens_seen \ + --medusa_num_heads $MEDUSA_NUM_HEADS \ + --medusa_num_layers $MEDUSA_NUM_LAYERS \ + --medusa_heads_coefficient $MEDUSA_HEADS_COEFFICIENT \ + --medusa_decay_coefficient $MEDUSA_DECAY_COEFFICIENT \ + --medusa_scheduler $MEDUSA_SCHEDULER \ + --medusa_lr_multiplier $MEDUSA_LR_MULTIPLIER \ + --medusa_only_heads False \ + --medusa_return True \ + --use_liger True diff --git a/examples/medusa/train.py b/examples/medusa/train.py new file mode 100755 index 0000000000000000000000000000000000000000..1d321343055cf8256b77e7de3c0646a3da24cbf5 --- /dev/null +++ b/examples/medusa/train.py @@ -0,0 +1,381 @@ +# This code is based on tatsu-lab/stanford_alpaca. Below is the original copyright: +# +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Adapted from: https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
+
+import json
+import os
+import pathlib
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+from typing import Optional
+
+import torch
+import transformers
+
+from callback import EfficiencyCallback
+from medusa_util import add_medusa_heads
+from safetensors.torch import save_file
+from sklearn.model_selection import train_test_split
+from torch.utils.data import Dataset
+from transformers import Trainer
+from transformers.trainer_pt_utils import LabelSmoother
+
+from liger_kernel.transformers import AutoLigerKernelForCausalLM
+
+IGNORE_TOKEN_ID = LabelSmoother.ignore_index
+
+
+@dataclass
+class ModelArguments:
+    model_name_or_path: Optional[str] = field(default="meta-llama/Meta-Llama-3-8B-Instruct")
+
+
+@dataclass
+class DataArguments:
+    data_path: str = field(
+        default="Aeala/ShareGPT_Vicuna_unfiltered",
+        metadata={"help": "Path to the training data."},
+    )
+    eval_data_path: str = field(default=None, metadata={"help": "Path to the evaluation data."})
+    lazy_preprocess: bool = True
+
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):
+    cache_dir: Optional[str] = field(default=None)
+    report_to: Optional[str] = None
+    optim: str = field(default="adamw_torch")
+    model_max_length: int = field(
+        default=2048,
+        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
+    )
+    medusa_num_heads: int = field(
+        default=1,
+        metadata={"help": "Number of Medusa heads."},
+    )
+    medusa_num_layers: int = field(
+        default=1,
+        metadata={"help": "Number of layers for each Medusa head."},
+    )
+    medusa_heads_coefficient: float = field(
+        default=1.0,
+        metadata={"help": "Coefficient for the Medusa heads."},
+    )
+    medusa_decay_coefficient: float = field(
+        default=1.0,
+        metadata={"help": "Decay coefficient for the Medusa head losses."},
+    )
+    medusa_scheduler: str = field(
+        default="constant",
+        metadata={"help": "Scheduler for the Medusa heads."},
+    )
+    medusa_lr_multiplier: float = field(
+        default=0.0,
+        metadata={"help": "Learning rate multiplier for the Medusa heads."},
+    )
+    medusa_return: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether the forward pass returns Medusa head predictions. Defaults to False, in which case the regular lm_head is used for single-token prediction."
+        },
+    )
+    medusa_only_heads: bool = field(
+        default=False,
+        metadata={"help": "Whether to train the Medusa heads only. Defaults to False, in which case the whole model is trained."},
+    )
+    use_liger: bool = field(
+        default=False,
+        metadata={"help": "Whether to apply the Liger kernel to the model."},
+    )
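Since all of the Medusa knobs live on this dataclass, the configuration the launch script passes on the command line can also be built directly in Python, e.g. for quick experiments. A minimal sketch, assuming this file is importable as the module `train` (values mirror `scripts/llama3_8b_medusa.sh`; note that instantiating `TrainingArguments` performs device setup, so flags like `bf16` must match the machine):

```python
from train import TrainingArguments  # hypothetical import; requires train.py on the path

args = TrainingArguments(
    output_dir="./llama3-8b-medusa-liger",
    bf16=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=1e-5,
    medusa_num_heads=5,
    medusa_num_layers=1,
    medusa_heads_coefficient=0.2,
    medusa_decay_coefficient=0.8,
    medusa_return=True,       # train with the Medusa forward pass
    medusa_only_heads=False,  # update the whole model, not just the heads
    use_liger=True,
)
```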
+local_rank = None
+
+
+def rank0_print(*args):
+    if local_rank == 0:
+        print(*args)
+
+
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+    """
+    Save the model's state dictionary to a specified directory.
+
+    Args:
+        trainer (transformers.Trainer): The Hugging Face Trainer object.
+        output_dir (str): The directory where the model state dictionary will be saved.
+    """
+    state_dict = trainer.model.state_dict()
+    if trainer.args.should_save:
+        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+        del state_dict
+        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+def preprocess(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+) -> Dict:
+    """
+    Preprocesses conversation data and tokenizes it for model input.
+
+    Args:
+        sources: A list of conversation sources.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for tokenization.
+
+    Returns:
+        Dict: A dictionary containing tokenized inputs, labels, and attention mask.
+    """
+
+    # Apply prompt templates
+    conversations = []
+    prompts = []
+    # NOTE: only the first 50 conversations are used
+    for conversation in sources[:50]:
+        tokenizer_compatible_conv = [
+            {
+                "role": "user" if c["from"] == "human" else "assistant",
+                "content": c["value"],
+            }
+            for c in conversation["conversations"]
+        ]
+        prompt = tokenizer.apply_chat_template(tokenizer_compatible_conv, tokenize=False)
+        prompts.append(prompt)
+        conversations.append(tokenizer_compatible_conv)
+
+    # Tokenize conversations
+    encoding = tokenizer(
+        prompts,
+        return_tensors="pt",
+        padding="max_length",
+        truncation=True,
+        return_offsets_mapping=True,
+    )
+    # Set everything to be ignored, except the assistant part
+    targets = torch.full_like(encoding.input_ids, IGNORE_TOKEN_ID)
+    input_ids = encoding.input_ids
+
+    # Mask targets. Only compute loss on the assistant outputs.
+    for conv_index, (conversation, target, prompt) in enumerate(zip(conversations, targets, prompts)):
+        for turn in conversation:
+            if turn["role"] == "assistant":
+                content = turn["content"]
+                # Unfortunate strip() necessary because chat templates are doing the same.
+                start = prompt.index(content.strip())
+                stop = start + len(content.strip())
+                indices = []
+                for tok_index, (tok_start, tok_stop) in enumerate(encoding.offset_mapping[conv_index]):
+                    # Keep tokens whose character span overlaps the assistant span [start, stop)
+                    if tok_start < stop and tok_stop > start:
+                        indices.append(tok_index)
+                target[indices] = encoding.input_ids[conv_index][indices]
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+        attention_mask=input_ids.ne(tokenizer.pad_token_id),
+    )
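Since the overlap test above is easy to get wrong, here is a tokenizer-free toy illustration of the same masking logic (the prompt, span, and offsets are all made up):

```python
# Offsets are (char_start, char_stop) pairs per token; we unmask exactly the
# tokens overlapping the assistant span [start, stop).
prompt = "user: hi assistant: hello there"
content = "hello there"
start = prompt.index(content)  # 20
stop = start + len(content)    # 31

# Hypothetical offsets, as a fast tokenizer might produce them:
offset_mapping = [(0, 4), (4, 8), (8, 19), (19, 26), (26, 31)]

indices = [
    i
    for i, (tok_start, tok_stop) in enumerate(offset_mapping)
    if tok_start < stop and tok_stop > start
]
print(indices)  # [3, 4] -> only the tokens covering "hello there" keep their labels
```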
+ """ + + def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer): + super(LazySupervisedDataset, self).__init__() + self.tokenizer = tokenizer + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.raw_data = raw_data + self.cached_data_dict = {} + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + if i in self.cached_data_dict: + return self.cached_data_dict[i] + + ret = preprocess([self.raw_data[i]], self.tokenizer) + ret = dict( + input_ids=ret["input_ids"][0], + labels=ret["labels"][0], + attention_mask=ret["attention_mask"][0], + ) + self.cached_data_dict[i] = ret + + return ret + + +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args, test_size=0.05) -> Dict: + """Make dataset and collator for supervised fine-tuning. + + Args: + tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for data preprocessing. + data_args: Data arguments. + test_size: evaluation data ratio (default: 0.05) + + Returns: + dict: A dictionary containing train and eval datasets. + """ + dataset_cls = LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset + rank0_print("Loading data...") + + # Load the entire dataset + train_json = json.load(open(data_args.data_path, "r")) + + # Perform a train-test split based on test_size + train_data, eval_data = train_test_split(train_json, test_size=test_size, random_state=42) + # Create the train and eval datasets + train_dataset = dataset_cls(train_data, tokenizer=tokenizer) + eval_dataset = dataset_cls(eval_data, tokenizer=tokenizer) + + return dict(train_dataset=train_dataset, eval_dataset=eval_dataset) + + +def train(): + global local_rank + + parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=True, + ) + tokenizer.pad_token = tokenizer.unk_token + tokenizer.pad_token = tokenizer.eos_token + + # Making sure the tokenizer works before loading the model. + print(tokenizer(["This is a test", "secondary"], padding=True)) + print(tokenizer.apply_chat_template([{"role": "user", "content": "This is a test"}])) + + def _model_loader(): + # we use a customized model loader to inject medusa heads to FSDP-wrapped model variables properly. + # see https://github.com/linkedin/Liger-Kernel/issues/309#issuecomment-2455077623 for details. 
+
+        # Load model
+        if training_args.use_liger:
+            model_builder = AutoLigerKernelForCausalLM.from_pretrained
+        else:
+            model_builder = transformers.AutoModelForCausalLM.from_pretrained
+        model = model_builder(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            torch_dtype=torch.bfloat16,  # `torch_dtype` is the kwarg recognized by the pinned transformers 4.51
+        )
+
+        # Freeze the base model
+        for param in model.base_model.parameters():
+            param.requires_grad = False
+
+        # Inject Medusa heads
+        add_medusa_heads(
+            model,
+            training_args.medusa_num_heads,
+            training_args.medusa_num_layers,
+            training_args.medusa_return,
+            training_args.medusa_only_heads,
+            training_args.use_liger,
+        )
+        return model
+
+    # Format output dir
+    training_args.output_dir = f"{training_args.output_dir}_medusa_mlp_{model_args.model_name_or_path.split('/')[-1]}_medusa_{training_args.medusa_num_heads}_lr_{training_args.learning_rate}_layers_{training_args.medusa_num_layers}"
+
+    # Load data
+    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
+
+    # Start trainer
+    trainer = Trainer(
+        model_init=_model_loader,
+        tokenizer=tokenizer,
+        args=training_args,
+        callbacks=[EfficiencyCallback()],
+        **data_module,
+    )
+
+    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+        trainer.train(resume_from_checkpoint=True)
+    else:
+        trainer.train()
+
+    if training_args.medusa_return and training_args.medusa_only_heads:
+        # Save only the updated heads without saving the backbone model
+        state_dict = {
+            k.replace("medusa_head.", ""): v.to(torch.bfloat16)
+            for k, v in trainer.accelerator.get_state_dict(trainer.model).items()
+            if "medusa_head" in k
+        }
+
+        # Save Medusa heads
+        if local_rank == 0:
+            save_file(
+                state_dict,
+                os.path.join(training_args.output_dir, "medusa_lm_head.safetensors"),
+            )
+        trainer.accelerator.wait_for_everyone()
+    else:
+        # Save the whole model weights
+        trainer.save_model(training_args.output_dir)
+
+
+if __name__ == "__main__":
+    train()
diff --git a/licenses/LICENSE-Apache-2.0 b/licenses/LICENSE-Apache-2.0
new file mode 100755
index 0000000000000000000000000000000000000000..0328c5ff05074b77adab24051b423e722ac9941c
--- /dev/null
+++ b/licenses/LICENSE-Apache-2.0
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2024-] [Unsloth AI, Daniel Han-Chen & Michael Han-Chen] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/licenses/LICENSE-MIT-AutoAWQ b/licenses/LICENSE-MIT-AutoAWQ new file mode 100755 index 0000000000000000000000000000000000000000..c8de3cf7f0202fc59b57dbdbee9ac936756e4a29 --- /dev/null +++ b/licenses/LICENSE-MIT-AutoAWQ @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 MIT HAN Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-MIT-Efficient-Cross-Entropy b/licenses/LICENSE-MIT-Efficient-Cross-Entropy new file mode 100755 index 0000000000000000000000000000000000000000..17736429bcfbc11fa9d9bdf9ca549f5ea5a2c8a4 --- /dev/null +++ b/licenses/LICENSE-MIT-Efficient-Cross-Entropy @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 mgmalek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/licenses/LICENSE-MIT-llmc b/licenses/LICENSE-MIT-llmc new file mode 100755 index 0000000000000000000000000000000000000000..99d8f1f022950f0dc55f01b996d219c122ac2db6 --- /dev/null +++ b/licenses/LICENSE-MIT-llmc @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Andrej Karpathy + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/licenses/LICENSE-MIT-triton b/licenses/LICENSE-MIT-triton new file mode 100755 index 0000000000000000000000000000000000000000..0f3852f090ae2ef4dd2c9734669cb404a2f788da --- /dev/null +++ b/licenses/LICENSE-MIT-triton @@ -0,0 +1,23 @@ +/* +* Copyright 2018-2020 Philippe Tillet +* Copyright 2020-2022 OpenAI +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100755 index 0000000000000000000000000000000000000000..9a1ba6607eb0e9a37ffdae5f1b4c43efdd1397c0 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,87 @@ +site_name: Liger-Kernel Docs +site_url: https://linkedin.github.io/Liger-Kernel/ +site_author: LinkedIn +site_description: Efficient Triton Kernels for LLM Training + +theme: + name: material + font: + text: Merriweather Sans + code: Red Hat Mono + features: + - navigation.footer + - toc.follow + - navigation.top + - navigation.sections + palette: + # Dark Mode + - scheme: slate + toggle: + icon: material/weather-sunny + name: Dark mode + primary: green + accent: deep purple + + # Light Mode + - scheme: default + toggle: + icon: material/weather-night + name: Light mode + primary: blue + accent: deep purple + +nav: + - Home: index.md + - Examples: Examples.md + - Getting Started: Getting-Started.md + - High Level APIs: High-Level-APIs.md + - Low Level APIs: Low-Level-APIs.md + - Contributing: contributing.md + - Acknowledgment: acknowledgement.md + - License: license.md + +markdown_extensions: + - attr_list + - toc: + permalink: true + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.tabbed: + alternate_style: true + - admonition + - pymdownx.details + +plugins: + - search + - mkdocstrings: + handlers: + python: + paths: [src] + options: + show_root_heading: true + show_source: true + docstring_style: google + docstring_section_style: table + heading_level: 3 + show_signature_annotations: false # Hides type annotations to save space + separate_signature: true # Separates signature from description + + +# Repository +repo_name: linkedin/Liger-Kernel +repo_url: https://github.com/linkedin/Liger-Kernel +edit_uri: edit/main/docs/ + +extra: + social: + - icon: simple/github + link: https://github.com/linkedin/Liger-Kernel diff 
--git a/pyproject.toml b/pyproject.toml new file mode 100755 index 0000000000000000000000000000000000000000..b613ff44d6037857e5c422307d655554060f8d03 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,86 @@ +[build-system] +requires = ["setuptools>=42", "wheel", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[project] +name = "liger_kernel" +version = "0.7.0" +description = "Efficient Triton kernels for LLM Training" +urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" } +readme = { file = "README.md", content-type = "text/markdown" } +license = { file = "LICENSE" } +dynamic = ["dependencies", "optional-dependencies"] + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] +include = ["liger_kernel*"] +namespaces = false + +[tool.pytest.ini_options] +pythonpath = ["src", "."] +asyncio_mode = "auto" +log_cli = true +log_cli_level = "INFO" +addopts = [ + "--cov=src/liger_kernel", + "--cov-report=term-missing", + "--cov-report=html", + "--cov-config=pyproject.toml", + "--durations=0" +] +python_files = "test_*.py" +testpaths = ["test/"] + +[tool.coverage.run] +branch = true +parallel = true +source = ["src/liger_kernel"] +# xdist uses subprocesses; "multiprocessing" is a safe concurrency choice +concurrency = ["multiprocessing"] + +[tool.coverage.paths] +liger_kernel = [ + "src/liger_kernel", + "*/site-packages/liger_kernel" +] + +[tool.coverage.report] +omit = ["test/*"] +show_missing = true +skip_covered = false + + +[tool.ruff] +line-length = 120 +target-version = "py310" +respect-gitignore = true +src = ["src"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort +] +ignore = ["E501", "B006", "E731", "A002", "E203"] + +exclude = [ + ".git", + "__pycache__", + "benchmark_internal/others", + ".venv", +] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.ruff.lint.isort] +known-first-party = ["liger_kernel"] +force-single-line = true +lines-between-types = 1 diff --git a/setup.py b/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..bf3457eebf0dc9cb1e698c39e55755e186a4d1fb --- /dev/null +++ b/setup.py @@ -0,0 +1,132 @@ +# setup.py + +import subprocess + +from typing import Literal + +from setuptools import setup + + +def get_default_dependencies(): + """Determine the appropriate dependencies based on detected hardware.""" + platform = get_platform() + + if platform in ["cuda", "cpu"]: + return [ + "torch>=2.1.2", + "triton>=2.3.1", + ] + elif platform == "rocm": + return [ + "triton>=3.0.0", + ] + elif platform == "xpu": + return [ + "torch>=2.6.0", + ] + # TODO: Currently, triton-ascend is not compatible with torch 2.7.1. We will upgrade it later. + elif platform == "npu": + return ["torch==2.6.0", "torch_npu==2.6.0", "triton-ascend"] + + +def get_optional_dependencies(): + """Get optional dependency groups.""" + return { + "dev": [ + "transformers>=4.52.0", + "matplotlib>=3.7.2", + "ruff>=0.12.0", + "pytest>=7.1.2", + "pytest-xdist", + "pytest-cov", + "pytest-asyncio", + "pytest-rerunfailures", + "datasets>=2.19.2", + "seaborn", + "mkdocs-material", + "torchvision>=0.20", + "prek>=0.2.28", + ] + } + + +def is_xpu_available(): + """ + Check if Intel XPU is available. + xpu-smi is often missing right now. 
+ """ + try: + subprocess.run(["xpu-smi"], check=True) + return True + except (subprocess.SubprocessError, FileNotFoundError): + pass + + try: + result = subprocess.run("sycl-ls", check=True, capture_output=True, shell=True) + if "level_zero:gpu" in result.stdout.decode(): + return True + except (subprocess.SubprocessError, FileNotFoundError): + pass + + return False + + +def is_ascend_available() -> bool: + """Best-effort Ascend detection. + + Checks for common Ascend environment variables and a possible `npu-smi` + utility if present. + """ + try: + subprocess.run(["npu-smi", "info"], check=True) + return True + except (subprocess.SubprocessError, FileNotFoundError): + pass + return False + + +def get_platform() -> Literal["cuda", "rocm", "cpu", "xpu", "npu"]: + """ + Detect whether the system has NVIDIA or AMD GPU without torch dependency. + """ + # Try nvidia-smi first + try: + subprocess.run(["nvidia-smi"], check=True) + print("NVIDIA GPU detected") + return "cuda" + except (subprocess.SubprocessError, FileNotFoundError): + # If nvidia-smi fails, check for ROCm + try: + subprocess.run(["rocm-smi"], check=True) + print("ROCm GPU detected") + return "rocm" + except (subprocess.SubprocessError, FileNotFoundError): + if is_xpu_available(): + print("Intel GPU detected") + return "xpu" + elif is_ascend_available(): + print("Ascend NPU detected") + return "npu" + else: + print("No GPU detected") + return "cpu" + + +setup( + name="liger_kernel", + package_dir={"": "src"}, + packages=["liger_kernel"], + install_requires=get_default_dependencies(), + extras_require=get_optional_dependencies(), + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", + "License :: OSI Approved :: BSD-2-Clause Software License", + "Operating System :: OS Independent", + ], +) diff --git a/src/liger_kernel/__init__.py b/src/liger_kernel/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/liger_kernel/chunked_loss/README.md b/src/liger_kernel/chunked_loss/README.md new file mode 100755 index 0000000000000000000000000000000000000000..1dd7037f2dec6cf6189c224111f6825c0d265e2d --- /dev/null +++ b/src/liger_kernel/chunked_loss/README.md @@ -0,0 +1,25 @@ +# Liger FlexChunkLoss: Alignment and Distillation loss + +Liger FlexChunkLoss offers a versatile interface, delivering up to 80% memory savings and a 10% throughput boost for post-training loss functions, including alignment (DPO, ORPO, CPO, KTO) and very soon, distillation. Its flexible design supports custom losses, ensuring efficiency gains across diverse use cases. + +### User interface + +FlexChunkLoss offers two flexible usage options: + +1. **Via `Liger[Custom Loss]Trainer`** + For example, by simply replacing the HuggingFace `ORPOTrainer` with `LigerORPOTrainer` in your code, you can leverage our optimized ORPO implementation and immediately benefit from improved performance. + +2. **Using `nn.Module` Implementations of Custom Loss Functions** + Explore the [LigerORPOTrainer implementation](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/transformers/orpo_trainer.py) to see how the modular design integrates custom loss functions seamlessly. 
+
+### What's under the hood?
+
+We employ chunking and fused kernel optimizations to enhance performance. By fusing the final linear layer with loss computation and calculating backward gradients during the forward pass, we significantly reduce the need for storing intermediate activations. All operations are implemented in PyTorch, leveraging `torch.compile` to streamline kernel execution without relying on extensive low-level optimizations. Additionally, we minimize `torch.compile` recompilations to reduce overhead and ensure consistent performance gains. A simplified sketch of the chunked, gradient-in-forward idea follows this file.
+
+### Extending to custom loss functions
+
+We provide two base classes: `LigerFusedLinearPreferenceBase` for alignment use cases and `LigerFusedLinearDistillationBase` for distillation use cases. These base classes manage chunking, kernel fusions, and Torch compilation.
+
+To implement a custom loss function, you need to create a subclass that defines the custom preference or distillation loss function, capable of processing a given input chunk. The base class will take care of the optimizations, handling most of the heavy lifting for you.
+
+For a working example, refer to the [ORPO loss implementation](https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/chunked_loss/orpo_loss.py).
\ No newline at end of file
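The following is a deliberately simplified sketch of the chunked, gradient-in-forward idea described above. It is not the library's implementation: there is no `ignore_index` or temperature handling, eager autograd stands in for the hand-fused steps, and the real base classes wrap this pattern in a `torch.autograd.Function` whose backward simply returns the pre-computed gradients, with the per-chunk step run under `torch.compile`:

```python
import torch
import torch.nn.functional as F


def chunked_fused_linear_ce(hidden, weight, labels, chunk_size=1024):
    """Mean cross entropy over (hidden @ weight.T), computed chunk by chunk,
    accumulating input/weight gradients during the forward pass so the full
    (N, vocab_size) logits tensor is never materialized."""
    n = hidden.shape[0]
    total_loss = torch.zeros((), dtype=hidden.dtype)
    grad_hidden = torch.zeros_like(hidden)
    grad_weight = torch.zeros_like(weight)
    for s in range(0, n, chunk_size):
        h = hidden[s : s + chunk_size].detach().requires_grad_()
        w = weight.detach().requires_grad_()
        # Only a (chunk_size, vocab_size) logits slice is alive at any one time.
        chunk_loss = F.cross_entropy(h @ w.T, labels[s : s + chunk_size], reduction="sum")
        chunk_loss.backward()  # gradient-in-forward: grads are ready before the next chunk
        total_loss += chunk_loss.detach()
        grad_hidden[s : s + chunk_size] = h.grad
        grad_weight += w.grad
    return total_loss / n, grad_hidden / n, grad_weight / n


hidden = torch.randn(4096, 512)            # flattened (batch * seq_len, hidden)
weight = torch.randn(32000, 512)           # lm_head weight (vocab, hidden)
labels = torch.randint(0, 32000, (4096,))
loss, grad_hidden, grad_weight = chunked_fused_linear_ce(hidden, weight, labels)
```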
diff --git a/src/liger_kernel/chunked_loss/__init__.py b/src/liger_kernel/chunked_loss/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..d3624adbbfb6455f245fa8c98e191d6f0db03fa5
--- /dev/null
+++ b/src/liger_kernel/chunked_loss/__init__.py
@@ -0,0 +1,8 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss  # noqa: F401
+from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss  # noqa: F401
+from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOLoss  # noqa: F401
+from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOLoss  # noqa: F401
diff --git a/src/liger_kernel/chunked_loss/cosine_similarity_loss.py b/src/liger_kernel/chunked_loss/cosine_similarity_loss.py
new file mode 100755
index 0000000000000000000000000000000000000000..553ca34237cf98460bb27554ee65e88ef2fe9f92
--- /dev/null
+++ b/src/liger_kernel/chunked_loss/cosine_similarity_loss.py
@@ -0,0 +1,142 @@
+from typing import Tuple
+from typing import Union
+
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+
+
+class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
+    @staticmethod
+    def distillation_loss_fn(
+        student_logits,
+        teacher_logits,
+        target=None,
+        ignore_index=None,
+        beta=1.0,
+    ):
+        """
+        Compute the cosine similarity distillation loss.
+        Args:
+            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+            target, ignore_index: Unused here; kept for interface compatibility with the distillation base class.
+            beta (float): Coefficient of the generalized cosine similarity loss, in the interval [0, 1]. Default: 1.0.
+        Returns:
+            torch.Tensor: cosine similarity loss
+        """
+        # NOTE: F.cosine_similarity normalizes its inputs as well, so the explicit
+        # normalization below is not strictly required.
+        student_norm = F.normalize(student_logits, p=2, dim=-1)
+        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
+
+        cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1)
+        loss = beta * (1 - cosine_sim)
+        return loss.sum()
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            student_input=student_input,
+            student_weight=student_weight,
+            teacher_input=teacher_input,
+            teacher_weight=teacher_weight,
+            target=true_labels,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
+
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+            None,  # return_soft_hard_loss
+        )
+
+
+class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
+    def __init__(
+        self,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
+    ):
+        super().__init__()
+        assert temperature != 0, "Temperature cannot be 0."
+ self.weight_hard_loss = weight_hard_loss + self.weight_soft_loss = weight_soft_loss + self.ignore_index = ignore_index + self.temperature = temperature + self.compiled = compiled + self.beta = beta + self.chunk_size = chunk_size + self.return_soft_hard_loss = return_soft_hard_loss + + def forward( + self, + student_input: torch.Tensor, + student_weight: torch.Tensor, + teacher_input: torch.Tensor, + teacher_weight: torch.Tensor, + true_labels: torch.LongTensor, + student_bias: torch.Tensor = None, + teacher_bias: torch.Tensor = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + return LigerFusedLinearCosineSimilarityFunction.apply( + student_input, + student_weight, + teacher_input, + teacher_weight, + true_labels, + student_bias, + teacher_bias, + self.weight_hard_loss, + self.weight_soft_loss, + self.beta, + self.ignore_index, + self.temperature, + self.compiled, + self.chunk_size, + self.return_soft_hard_loss, + ) diff --git a/src/liger_kernel/chunked_loss/cpo_loss.py b/src/liger_kernel/chunked_loss/cpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..1f0a154974ee689a34db3df367c4e54c20f67916 --- /dev/null +++ b/src/liger_kernel/chunked_loss/cpo_loss.py @@ -0,0 +1,157 @@ +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase + + +class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase): + @staticmethod + def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1, label_smoothing=0.0): + """ + Paper: https://arxiv.org/pdf/2401.08417 + + Formula: + L(π_θ; U) = -E_(x,y_w,y_l)~D[log σ(β log π_θ(y_w|x) - β log π_θ(y_l|x))] + + Where: + - π_θ(y|x): Policy (model) probability + - y_w: Chosen sequence + - y_l: Rejected sequence + - σ: Sigmoid function + - β: Temperature parameter + - E: Expected value over the dataset D + - D: Dataset of preferences + + Args: + chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,). + rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,). + full_target (torch.Tensor): Non chunked full target tensor + beta (float): Weight for the CPO loss + label_smoothing (float): Label smoothing factor, will reduce to Equation above when label_smoothing -> 0. + """ + logits = beta * (chosen_logps - rejected_logps) + loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / ( + full_target.shape[0] // 2 + ) + + chosen_rewards = beta * chosen_logps + rejected_rewards = beta * rejected_logps + + return loss, chosen_rewards, rejected_rewards + + @classmethod + def forward( + cls, + ctx, + _input, + weight, + target, + bias=None, + ignore_index=-100, + beta=0.1, + alpha=1.0, + label_smoothing=0.0, + compute_nll_loss=True, + compiled=True, + average_log_prob=False, + chunk_size=1, + ): + """ + Fused linear layer with CPO loss. + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size) + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size) + target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,) + bias (torch.Tensor, optional): Bias tensor. 
Shape: (vocab_size,)
+            ignore_index (int): Index to ignore in loss computation
+            beta (float): Weight for the CPO loss
+            alpha (float): Weight for the NLL loss term
+            label_smoothing (float): Label smoothing factor
+            compute_nll_loss (bool): Whether to compute the NLL loss
+            compiled (bool): Whether to use torch compile
+            average_log_prob (bool): Whether to average the log probability per non-masked token
+            chunk_size (int): Size of chunks for processing.
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            target=target,
+            bias=bias,
+            ignore_index=ignore_index,
+            alpha=alpha,
+            beta=beta,
+            label_smoothing=label_smoothing,
+            compute_nll_loss=compute_nll_loss,
+            average_log_prob=average_log_prob,
+            compiled=compiled,
+            chunk_size=chunk_size,
+        )
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        return *grads, None, None, None, None, None, None, None, None
+
+
+class LigerFusedLinearCPOLoss(torch.nn.Module):
+    """
+    Fused linear layer with CPO loss.
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        alpha: float = 1.0,
+        label_smoothing: float = 0.0,
+        compute_nll_loss: bool = True,
+        compiled: bool = True,
+        average_log_prob: bool = False,
+        chunk_size: int = 1,
+    ):
+        """
+        Args:
+            ignore_index (int): Index to ignore in the loss.
+            beta (float): Weight for the CPO loss.
+            alpha (float): Weight for the NLL loss term.
+            label_smoothing (float): Label smoothing factor.
+            compute_nll_loss (bool): Whether to compute the NLL loss.
+            compiled (bool): Whether to use the torch compiled kernel.
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
+            chunk_size (int): Size of chunks for processing.
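+
+        Example (illustrative; the tensor arguments are placeholders shaped as
+        documented in LigerFusedLinearCPOFunction.forward; the weight is passed
+        first, and the module returns the loss together with a tuple of
+        aggregated metrics):
+            >>> loss_fn = LigerFusedLinearCPOLoss(beta=0.1, alpha=1.0)
+            >>> loss, metrics = loss_fn(lin_weight, _input, target)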
+ """ + super().__init__() + self.ignore_index = ignore_index + self.beta = beta + self.alpha = alpha + self.label_smoothing = label_smoothing + self.compute_nll_loss = compute_nll_loss + self.compiled = compiled + self.average_log_prob = average_log_prob + self.chunk_size = chunk_size + + def forward( + self, + lin_weight, + _input, + target, + bias=None, + ): + return LigerFusedLinearCPOFunction.apply( + _input, + lin_weight, + target, + bias, + self.ignore_index, + self.beta, + self.alpha, + self.label_smoothing, + self.compute_nll_loss, + self.compiled, + self.average_log_prob, + self.chunk_size, + ) diff --git a/src/liger_kernel/chunked_loss/dpo_loss.py b/src/liger_kernel/chunked_loss/dpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..f7a14e539e45130b1301be481f71d5100dceecc7 --- /dev/null +++ b/src/liger_kernel/chunked_loss/dpo_loss.py @@ -0,0 +1,229 @@ +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase + + +class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase): + @staticmethod + def preference_loss_fn( + chosen_logps, + rejected_logps, + full_target, + ref_chosen_logps=None, + ref_rejected_logps=None, + beta=0.1, + loss_type="sigmoid", + ): + """ + Paper: https://arxiv.org/pdf/2305.18290 + + Formula: + L_DPO = -E[ log_sigmoid( β * (log(π(y_w|x)/π_ref(y_w|x)) - log(π(y_l|x)/π_ref(y_l|x))) ) ] + + Where: + - π(y|x): Policy (model) probability + - π_ref(y|x): Reference model probability + - y_w: Chosen sequence + - y_l: Rejected sequence + - β: Weight for the direct preference loss + - E: Expected value over the dataset + + Args: + chosen_logps: Log probabilities of chosen tokens (batch_size,) + rejected_logps: Log probabilities of rejected tokens (batch_size,) + full_target: Non chunked full target tensor + ref_chosen_logps: Reference log probs of chosen tokens (batch_size,) + ref_rejected_logps: Reference log probs of rejected tokens (batch_size,) + beta: Weight for the direct preference loss + """ + + if ref_chosen_logps is None: + ref_chosen_logps = torch.tensor(0.0, device=chosen_logps.device) + if ref_rejected_logps is None: + ref_rejected_logps = torch.tensor(0.0, device=rejected_logps.device) + + chosen_logratios = chosen_logps - ref_chosen_logps + rejected_logratios = rejected_logps - ref_rejected_logps + + chosen_rewards = beta * chosen_logratios + rejected_rewards = beta * rejected_logratios + + if loss_type == "sigmoid": + logits_diff = beta * (chosen_logratios - rejected_logratios) + loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2) + + elif loss_type == "apo_zero": + # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are better than your model's default output + losses_chosen = 1 - F.sigmoid(beta * chosen_logratios) # Increase chosen likelihood + losses_rejected = F.sigmoid(beta * rejected_logratios) + losses = losses_chosen + losses_rejected + loss = losses.sum() / (full_target.shape[0] // 2) + + elif loss_type == "apo_down": + # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266) + # Use this loss when you believe the chosen outputs are worse than your model's default output. 
+            # Decrease chosen likelihood and decrease rejected likelihood more
+            losses_chosen = F.sigmoid(beta * chosen_logratios)
+            losses_rejected = 1 - F.sigmoid(beta * (chosen_logratios - rejected_logratios))
+            losses = losses_chosen + losses_rejected
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "sppo_hard":
+            # In the paper (https://huggingface.co/papers/2405.00675), SPPO employs a soft probability approach,
+            # estimated using the PairRM score. The probability calculation is conducted outside of the trainer class.
+            # The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is
+            # set to 1 for the winner and 0 for the loser.
+            a = chosen_logps - ref_chosen_logps
+            b = rejected_logps - ref_rejected_logps
+            losses = (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        elif loss_type == "nca_pair":
+            losses = (
+                -F.logsigmoid(chosen_rewards)
+                - 0.5 * F.logsigmoid(-chosen_rewards)
+                - 0.5 * F.logsigmoid(-rejected_rewards)
+            )
+            loss = losses.sum() / (full_target.shape[0] // 2)
+
+        else:
+            raise ValueError(
+                f"Unsupported loss_type: {loss_type}. Supported types are: sigmoid, apo_zero, apo_down, sppo_hard, nca_pair"
+            )
+
+        return loss, chosen_rewards, rejected_rewards
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        _input,
+        weight,
+        target,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        ignore_index=-100,
+        beta=0.1,
+        compute_nll_loss=False,
+        compiled=True,
+        use_ref_model=True,
+        average_log_prob=False,
+        chunk_size=1,
+        loss_type="sigmoid",
+    ):
+        """
+        Fused linear layer with DPO loss.
+        Args:
+            _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size)
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size)
+            target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,)
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,)
+            ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size * seq_len, hidden_size)
+            ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
+            ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
+            ignore_index (int): Index to ignore in loss computation
+            beta (float): Weight for the direct preference loss
+            compute_nll_loss (bool): Whether to compute the NLL loss
+            compiled (bool): Whether to use torch compile
+            use_ref_model (bool): Whether to use a reference model
+            average_log_prob (bool): Whether to average the log probability per non-masked token
+            chunk_size (int): Size of chunks for processing.
+            loss_type (str): Loss variant to use; one of "sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair".
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            target=target,
+            bias=bias,
+            ignore_index=ignore_index,
+            beta=beta,
+            compute_nll_loss=compute_nll_loss,
+            compiled=compiled,
+            use_ref_model=use_ref_model,
+            ref_input=ref_input,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            average_log_prob=average_log_prob,
+            chunk_size=chunk_size,
+            loss_type=loss_type,
+        )
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        return *grads, None, None, None, None, None, None, None, None, None, None, None
+
+
+class LigerFusedLinearDPOLoss(torch.nn.Module):
+    """
+    Fused linear layer with DPO loss.
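+
+    Example (illustrative; the tensor arguments are placeholders, the weight is
+    passed first, and the module returns the loss together with a tuple of
+    aggregated metrics):
+        >>> dpo_loss = LigerFusedLinearDPOLoss(beta=0.1, use_ref_model=True)
+        >>> loss, metrics = dpo_loss(lin_weight, _input, target, ref_input=ref_input, ref_weight=ref_weight)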
+ """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + compute_nll_loss: bool = False, + compiled: bool = True, + use_ref_model: bool = True, + average_log_prob: bool = False, + chunk_size: int = 1, + loss_type: str = "sigmoid", + ): + """ + Args: + ignore_index (int): Index to ignore in the loss. + beta (float): Weight for the odds ratio loss. + compute_nll_loss (bool): Whether to compute the NLL loss. + compiled (bool): Whether to use the torch compiled kernel. + use_ref_model (bool): Whether to use a reference model for the DPO loss. + average_log_prob (bool): Whether to average the log probability per non-masked token. + chunk_size (int): Size of chunks for processing. + """ + super().__init__() + self.ignore_index = ignore_index + self.beta = beta + self.compute_nll_loss = compute_nll_loss + self.compiled = compiled + self.use_ref_model = use_ref_model + self.average_log_prob = average_log_prob + self.chunk_size = chunk_size + self.loss_type = loss_type + supported_loss_types = {"sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"} + if self.loss_type not in supported_loss_types: + raise ValueError(f"Unsupported loss_type: {self.loss_type}. Supported types are: {supported_loss_types}") + + def forward( + self, + lin_weight, + _input, + target, + bias=None, + ref_input=None, + ref_weight=None, + ref_bias=None, + ): + return LigerFusedLinearDPOFunction.apply( + _input, + lin_weight, + target, + bias, + ref_input, + ref_weight, + ref_bias, + self.ignore_index, + self.beta, + self.compute_nll_loss, + self.compiled, + self.use_ref_model, + self.average_log_prob, + self.chunk_size, + self.loss_type, + ) diff --git a/src/liger_kernel/chunked_loss/functional.py b/src/liger_kernel/chunked_loss/functional.py new file mode 100755 index 0000000000000000000000000000000000000000..722e60d4fb1c440af5e1c61909c1faadd28a1e9e --- /dev/null +++ b/src/liger_kernel/chunked_loss/functional.py @@ -0,0 +1,17 @@ +from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction +from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction +from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction +from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction +from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction +from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction +from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction +from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction + +liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply +liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply +liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply +liger_fused_linear_cosine = LigerFusedLinearCosineSimilarityFunction.apply +liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply +liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply +liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply +liger_fused_linear_grpo = LigerFusedLinearGRPOFunction.apply diff --git a/src/liger_kernel/chunked_loss/fused_linear_distillation.py b/src/liger_kernel/chunked_loss/fused_linear_distillation.py new file mode 100755 index 0000000000000000000000000000000000000000..c58f9320a7301536689b0538b2f8de432782db77 --- /dev/null +++ b/src/liger_kernel/chunked_loss/fused_linear_distillation.py @@ -0,0 +1,299 @@ +from abc import abstractmethod +from functools import partial +from typing import Tuple +from typing 
+
+import torch
+
+from torch.nn import functional as F
+
+
+class LigerFusedLinearDistillationBase(torch.autograd.Function):
+    @abstractmethod
+    def distillation_loss_fn(
+        student_logits,
+        teacher_logits,
+        target=None,
+        ignore_index=None,
+    ):
+        """
+        Compute distillation loss.
+        Args:
+            student_logits (torch.Tensor): Raw (temperature-scaled) logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+            teacher_logits (torch.Tensor): Raw (temperature-scaled) logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+            target (torch.Tensor, optional): Chunk of ground-truth labels, forwarded by _compute_loss. Shape: (chunk_size,).
+            ignore_index (int, optional): Index to ignore, forwarded by _compute_loss.
+        Returns:
+            torch.Tensor: Sum of distillation losses for the chunk. The class will handle
+            converting this to a mean loss by dividing by the number of non-ignored target tokens in _compute_loss.
+        """
+        raise NotImplementedError("Distillation loss function must be implemented.")
+
+    @staticmethod
+    def chunk_forward(
+        student_input_chunk,
+        student_weight,
+        teacher_input_chunk,
+        teacher_weight,
+        target_chunk,
+        student_bias=None,
+        teacher_bias=None,
+        ignore_index=-100,
+        compute_ce_loss=True,
+    ):
+        # Student
+        student_logits_chunk = student_input_chunk @ student_weight.t()
+        if student_bias is not None:
+            student_logits_chunk += student_bias
+        student_log_probs_chunk = F.log_softmax(student_logits_chunk.float(), dim=-1)
+
+        # Teacher
+        with torch.no_grad():
+            teacher_logits_chunk = teacher_input_chunk @ teacher_weight.t()
+            if teacher_bias is not None:
+                teacher_logits_chunk += teacher_bias
+
+        # The hard/task loss
+        ce_loss = 0.0
+        if compute_ce_loss:
+            ce_loss = F.nll_loss(
+                student_log_probs_chunk.view(-1, student_log_probs_chunk.shape[-1]),
+                target_chunk.view(-1),
+                reduction="sum",
+                ignore_index=ignore_index,
+            )
+
+        return student_logits_chunk, teacher_logits_chunk, ce_loss
+
+    @staticmethod
+    def _compute_loss(
+        student_input_chunk,
+        student_weight,
+        teacher_input_chunk,
+        teacher_weight,
+        target_chunk,
+        student_bias=None,
+        teacher_bias=None,
+        distillation_loss_fn=None,
+        full_target=None,
+        ignore_index=-100,
+        weight_hard_loss=0.5,
+        weight_soft_loss=0.5,
+        compute_ce_loss=True,
+        temperature=1,
+        **loss_kwargs,
+    ):
+        """
+        Compute the total loss for a chunk of input and target, using a knowledge distillation loss function.
+        Args:
+            distillation_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            student_input_chunk (torch.Tensor): Chunk of input tensor. Shape: (chunk_size, student_hidden_size).
+            student_weight (torch.Tensor): Weight tensor. Shape: (vocab_size, student_hidden_size).
+            teacher_input_chunk (torch.Tensor): Chunk of input tensor. Shape: (chunk_size, teacher_hidden_size).
+            teacher_weight (torch.Tensor): Weight tensor. Shape: (vocab_size, teacher_hidden_size).
+            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (chunk_size,).
+            student_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            teacher_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            full_target (torch.Tensor): Full target tensor. Shape: (batch_size * sequence_length,).
+            ignore_index (int): Index to ignore for loss computation.
+            weight_hard_loss (float): Weight for hard loss.
+            weight_soft_loss (float): Weight for soft loss.
+            compute_ce_loss (bool): Whether to compute CE loss.
+            temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
+            loss_kwargs (dict): Additional arguments for the loss function.
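+
+        The two terms are combined as follows (illustrative; both are already
+        normalized by the number of non-ignored target tokens):
+            loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss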
+ """ + ( + student_logits_chunk, + teacher_logits_chunk, + hard_loss, + ) = LigerFusedLinearDistillationBase.chunk_forward( + student_input_chunk, + student_weight, + teacher_input_chunk, + teacher_weight, + target_chunk, + student_bias=student_bias, + teacher_bias=teacher_bias, + ignore_index=ignore_index, + compute_ce_loss=compute_ce_loss, + ) + + student_logits_chunk /= temperature + teacher_logits_chunk /= temperature + + # If the teacher and student token size is different, pad student logits to match the teacher's. + # This only applies to cases where they share exactly the same vocab and tokenizer just + # that teacher logit is padded for some training efficiency such as + # https://huggingface.co/Qwen/Qwen1.5-72B-Chat/discussions/1#662883f568adf59b07b176d2 + teacher_vocab_size = teacher_weight.shape[0] + student_vocab_size = student_weight.shape[0] + if teacher_vocab_size > student_vocab_size: + pad_size = teacher_vocab_size - student_vocab_size + pad_tensor = torch.zeros( + (*student_logits_chunk.shape[:-1], pad_size), + dtype=student_logits_chunk.dtype, + device=student_logits_chunk.device, + ) + student_logits_chunk = torch.cat([student_logits_chunk, pad_tensor], dim=-1) + + num_valid_tokens = (full_target != ignore_index).sum() + num_valid_tokens = num_valid_tokens.clamp_min(1) # to avoid division by zero + + hard_loss /= num_valid_tokens + + soft_loss = distillation_loss_fn( + student_logits_chunk, teacher_logits_chunk, target=target_chunk, ignore_index=ignore_index, **loss_kwargs + ) + soft_loss /= num_valid_tokens + + loss = weight_hard_loss * hard_loss + weight_soft_loss * soft_loss + return loss, (soft_loss, hard_loss, student_logits_chunk, teacher_logits_chunk) + + @staticmethod + def forward( + cls, + ctx, + student_input, + student_weight, + teacher_input, + teacher_weight, + target, + student_bias=None, + teacher_bias=None, + chunk_size=1024, + ignore_index=-100, + weight_hard_loss=0.5, + weight_soft_loss=0.5, + beta=0.5, + compute_ce_loss=True, + temperature=1.0, + compiled=True, + return_soft_hard_loss=False, + **loss_kwargs, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """ + Base class for fused linear layer with distillation loss. + Only need to compute gradients for student model. + + Args: + student_input (torch.Tensor): Student input tensor. Shape: (batch_size * seq_len, student_hidden_size). + student_weight (torch.Tensor): Student weight tensor. Shape: (vocab_size, student_hidden_size). + teacher_input (torch.Tensor): Teacher input tensor. Shape: (batch_size * seq_len, teacher_hidden_size). + teacher_weight (torch.Tensor): Teacher weight tensor. Shape: (vocab_size, teacher_hidden_size). + target (torch.Tensor): Target truth label tensor. Shape: (batch_size * seq_len). + student_bias (torch.Tensor, optional): Student bias tensor. Shape: (vocab_size,). + teacher_bias (torch.Tensor, optional): Teacher bias tensor. Shape: (vocab_size,). + loss_fn (callable): Loss function to compute the loss on a chunk of input/target. + chunk_size (int): Size of a chunk. + ignore_index (int): Index to ignore for loss computation. + weight_hard_loss (float): Weight for hard/task loss. + weight_soft_loss (float): Weight for soft/distillation loss. + beta (float): Interpolation coefficient between 0 and 1 (default: 0.5). + compute_ce_loss (bool): Whether to compute CE loss. + temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. 
no scale) + compiled (bool): Whether to use torch compile for chunk accumulation. + return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False. + loss_kwargs (dict): Other possible arguments that a loss function might need + """ + CHUNK_SIZE = chunk_size + grad_weight = torch.zeros_like(student_weight) + grad_inputs = [] + grad_bias = torch.zeros_like(student_bias) if student_bias is not None else None + loss_acc = torch.zeros((), device=student_input.device) + soft_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None + hard_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None + + loss_func_to_call = partial( + LigerFusedLinearDistillationBase._compute_loss, + distillation_loss_fn=cls.distillation_loss_fn, + full_target=target, + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + compute_ce_loss=compute_ce_loss, + temperature=temperature, + beta=beta, + **loss_kwargs, + ) + + def accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk): + if student_bias is not None: + ( + (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), + ( + chunk_loss, + ( + chunk_soft_loss, + chunk_hard_loss, + chunk_student_logits, + chunk_teacher_logits, + ), + ), + ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1, 5), has_aux=True)( + student_input_chunk, + student_weight, + teacher_input_chunk, + teacher_weight, + target_chunk, + student_bias, + teacher_bias, + ) + grad_bias.add_(chunk_grad_bias) + else: + ( + (chunk_grad_input, chunk_grad_weight), + ( + chunk_loss, + ( + chunk_soft_loss, + chunk_hard_loss, + chunk_student_logits, + chunk_teacher_logits, + ), + ), + ) = torch.func.grad_and_value(loss_func_to_call, argnums=(0, 1), has_aux=True)( + student_input_chunk, + student_weight, + teacher_input_chunk, + teacher_weight, + target_chunk, + student_bias, + teacher_bias, + ) + grad_weight.add_(chunk_grad_weight) + loss_acc.add_(chunk_loss) + if return_soft_hard_loss: + soft_loss_acc.add_(chunk_soft_loss) + hard_loss_acc.add_(chunk_hard_loss) + return chunk_grad_input + + if compiled: + accumulate_chunk = torch.compile(accumulate_chunk) + + num_chunks = max(1, student_input.shape[0] // CHUNK_SIZE) + _student_input_chunks = torch.chunk(student_input, chunks=num_chunks, dim=0) + _teacher_input_chunks = torch.chunk(teacher_input, chunks=num_chunks, dim=0) + _target_chunks = torch.chunk(target, chunks=num_chunks, dim=0) + + for student_input_chunk, teacher_input_chunk, target_chunk in zip( + _student_input_chunks, _teacher_input_chunks, _target_chunks + ): + grad_input = accumulate_chunk(student_input_chunk, teacher_input_chunk, target_chunk) + grad_inputs.append(grad_input) + + ctx.save_for_backward( + torch.cat(grad_inputs, dim=0), + grad_weight, + grad_bias, + ) + if return_soft_hard_loss: + return loss_acc, soft_loss_acc, hard_loss_acc + return loss_acc + + @staticmethod + def backward(ctx, grad_output, *args): + grad_input, grad_weight, grad_bias = ctx.saved_tensors + if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)): + grad_input = grad_input * grad_output + grad_weight = grad_weight * grad_output + grad_bias = grad_bias * grad_output if grad_bias is not None else None + + return grad_input, grad_weight, None, None, None, grad_bias diff --git a/src/liger_kernel/chunked_loss/fused_linear_ppo.py b/src/liger_kernel/chunked_loss/fused_linear_ppo.py new file mode 100755 index 
0000000000000000000000000000000000000000..a382cda1b4470db4f217967cd2c4b4293d8e224c
--- /dev/null
+++ b/src/liger_kernel/chunked_loss/fused_linear_ppo.py
@@ -0,0 +1,421 @@
+import warnings
+
+from abc import abstractmethod
+from functools import partial
+
+import torch
+import torch._dynamo.config
+import torch.nn.functional as F
+
+
+class LigerFusedLinearPPOBase(torch.autograd.Function):
+    @abstractmethod
+    def ppo_loss_fn(*args, **kwargs):
+        """
+        To be extended by subclasses.
+        """
+        raise NotImplementedError("PPO loss function must be implemented.")
+
+    @staticmethod
+    def forward(
+        cls,
+        ctx,
+        _input,
+        weight,
+        selected_token_ids,
+        attention_mask,
+        advantages,
+        bias=None,
+        ref_per_token_logps=None,
+        old_per_token_logps=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        epsilon_low=0.2,
+        epsilon_high=0.2,
+        beta=0.04,
+        loss_type="dapo",
+        max_completion_length=None,
+        importance_sampling_level="token",
+        temperature=1.0,
+        compiled=True,
+        use_ref_model=False,
+        chunk_size=1,
+        sapo_temperature_pos=1.0,
+        sapo_temperature_neg=1.05,
+        vllm_is_ratio=None,
+        delta=None,
+        use_bias_correction_kl=False,
+    ):
+        # TODO: check torch compile matmul
+        """Chunked forward pass for PPO loss computation.
+
+        Args:
+            cls: The class
+            ctx: Context for backward
+            _input: Input tensor
+            weight: Weight tensor
+            selected_token_ids: Selected token ids tensor
+            attention_mask: Attention mask tensor
+            advantages: Advantages tensor
+            bias: Bias tensor
+            ref_per_token_logps: Reference model log probs per token tensor
+            old_per_token_logps: Old per token log probabilities tensor
+            ref_input: Reference model input tensor
+            ref_weight: Reference model weight tensor
+            ref_bias: Reference model bias tensor
+            epsilon_low: Lower bound for clipping the importance sampling ratio
+            epsilon_high: Upper bound for clipping the importance sampling ratio
+            beta: Weight for the KL penalty
+            loss_type: Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo", "sapo")
+            max_completion_length: Maximum completion length required for "dr_grpo"
+            importance_sampling_level: Level of importance sampling ("token" or "sequence")
+            temperature: Temperature for the logits
+            compiled: Whether to use torch compile
+            use_ref_model: Whether to use a reference model
+            chunk_size: Size of chunks for processing in other loss modules
+            sapo_temperature_pos: Temperature for positive advantages in SAPO
+            sapo_temperature_neg: Temperature for negative advantages in SAPO
+            vllm_is_ratio: vLLM importance sampling ratio tensor (batch_size, seq_len) or (batch_size, 1) or None.
+                Used to correct for distribution mismatch when using vLLM for generation.
+        """
+        if use_ref_model:
+            assert ref_per_token_logps is not None or ref_input is not None, (
+                "If use_ref_model is True, ref_per_token_logps or ref_input must be provided"
+            )
+            if ref_per_token_logps is not None and ref_input is not None:
+                warnings.warn("Both ref_per_token_logps and ref_input are provided. 
Using ref_per_token_logps.") + if loss_type == "dr_grpo": + assert max_completion_length is not None, "max_completion_length must be provided for loss_type 'dr_grpo'" + if vllm_is_ratio is not None: + B, T = attention_mask.shape + assert vllm_is_ratio.dim() in (1, 2), ( + f"vllm_is_ratio must be 1D (B,) or 2D (B, T) / (B, 1), got {vllm_is_ratio.dim()}D" + ) + if vllm_is_ratio.dim() == 2: + assert vllm_is_ratio.shape[0] == B and vllm_is_ratio.shape[1] in (1, T), ( + f"vllm_is_ratio shape must be ({B}, 1) or ({B}, {T}), got {tuple(vllm_is_ratio.shape)}" + ) + else: + assert vllm_is_ratio.shape[0] == B, ( + f"vllm_is_ratio shape must be ({B},), got {tuple(vllm_is_ratio.shape)}" + ) + vllm_is_ratio = vllm_is_ratio.unsqueeze(-1) # (B,) -> (B, 1) for broadcasting + # Initialize accumulators + loss_acc = torch.zeros((), device=_input.device, dtype=torch.float32) + grad_weight = torch.zeros_like(weight) # [V, H] + grad_inputs = [] + grad_bias = torch.zeros_like(bias) if bias is not None else None # [V] + aggregated_metrics = [] + + # Create a partial function with fixed arguments + compute_loss = partial( + LigerFusedLinearPPOBase._compute_chunk_loss, + ref_weight=ref_weight, + ref_bias=ref_bias, + full_attention_mask=attention_mask, + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + beta=beta, + loss_type=loss_type, + max_completion_length=max_completion_length, + importance_sampling_level=importance_sampling_level, + temperature=temperature, + use_ref_model=use_ref_model, + ppo_loss_fn=cls.ppo_loss_fn, + sapo_temperature_pos=sapo_temperature_pos, + sapo_temperature_neg=sapo_temperature_neg, + delta=delta, + use_bias_correction_kl=use_bias_correction_kl, + ) + + def fused_fwd_bwd( + input_chunk, + selected_token_ids_chunk, + attention_mask_chunk, + advantages_chunk, + ref_per_token_logps_chunk, + old_per_token_logps_chunk, + ref_input_chunk, + vllm_is_ratio_chunk, + ): + """Fused forward and backward for a chunk.""" + argnums = (0, 1, 5) if bias is not None else (0, 1) + return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=True)( + input_chunk, # arg 0 + weight, # arg 1 + selected_token_ids_chunk, # arg 2 + attention_mask_chunk, # arg 3 + advantages_chunk, # arg 4 + bias, # arg 5 + ref_per_token_logps_chunk=ref_per_token_logps_chunk, # arg 6 + old_per_token_logps_chunk=old_per_token_logps_chunk, # arg 7 + ref_input_chunk=ref_input_chunk, # arg 8 + vllm_is_ratio_chunk=vllm_is_ratio_chunk, # arg 9 + ) + + def accumulate_chunk( + input_chunk, + selected_token_ids_chunk, + attention_mask_chunk, + advantages_chunk, + ref_per_token_logps_chunk=None, + old_per_token_logps_chunk=None, + ref_input_chunk=None, + vllm_is_ratio_chunk=None, + ): + (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), (chunk_loss, chunk_metrics) = fused_fwd_bwd( + input_chunk, + selected_token_ids_chunk, + attention_mask_chunk, + advantages_chunk, + ref_per_token_logps_chunk, + old_per_token_logps_chunk, + ref_input_chunk, + vllm_is_ratio_chunk, + ) + if bias is not None: + grad_bias.add_(chunk_grad_bias[0]) + + # Accumulate gradients and loss + grad_weight.add_(chunk_grad_weight) + grad_inputs.append(chunk_grad_input) + loss_acc.add_(chunk_loss) + # Initialize storage for metrics on first chunk + if len(aggregated_metrics) == 0: + for metric in chunk_metrics: + if metric.ndim == 0: + aggregated_metrics.append(torch.zeros((), device=metric.device)) + else: + aggregated_metrics.append([]) + + # Accumulate metrics + for i, metric in enumerate(chunk_metrics): + if metric.ndim == 0: + 
aggregated_metrics[i].add_(metric) + else: + aggregated_metrics[i].append(metric) + + if compiled: + # TODO: Figure out what is better to compile here + # accumulate_chunk = torch.compile(accumulate_chunk) + fused_fwd_bwd = torch.compile(fused_fwd_bwd) + + # Process input in chunks based on chunk_size + chunks = max(1, _input.shape[0] // chunk_size) + _input_chunks = torch.chunk(_input, chunks=chunks, dim=0) + _selected_token_ids_chunks = torch.chunk(selected_token_ids, chunks=chunks, dim=0) + _attention_mask_chunks = torch.chunk(attention_mask, chunks=chunks, dim=0) + _advantages_chunks = torch.chunk(advantages, chunks=chunks, dim=0) + _ref_per_token_logps_chunks = ( + torch.chunk(ref_per_token_logps, chunks=chunks, dim=0) + if use_ref_model and ref_per_token_logps is not None + else [None] * chunks + ) + _old_per_token_logps_chunks = ( + torch.chunk(old_per_token_logps, chunks=chunks, dim=0) + if old_per_token_logps is not None + else [None] * chunks + ) + # if ref_log_probs is not none, then we don't need ref_input to calculate the log probs + _ref_input_chunks = ( + torch.chunk(ref_input, chunks=chunks, dim=0) + if use_ref_model and ref_per_token_logps is None + else [None] * chunks + ) + _vllm_is_ratio_chunks = ( + torch.chunk(vllm_is_ratio, chunks=chunks, dim=0) if vllm_is_ratio is not None else [None] * chunks + ) + + for ( + input_chunk, + selected_token_ids_chunk, + attention_mask_chunk, + advantages_chunk, + ref_per_token_logps_chunk, + old_per_token_logps_chunk, + ref_input_chunk, + vllm_is_ratio_chunk, + ) in zip( + _input_chunks, + _selected_token_ids_chunks, + _attention_mask_chunks, + _advantages_chunks, + _ref_per_token_logps_chunks, + _old_per_token_logps_chunks, + _ref_input_chunks, + _vllm_is_ratio_chunks, + ): + # Mark dynamic dimensions + torch._dynamo.mark_dynamic(input_chunk, 1) + torch._dynamo.mark_dynamic(selected_token_ids_chunk, 1) + torch._dynamo.mark_dynamic(attention_mask_chunk, 1) + if ref_per_token_logps_chunk is not None: + torch._dynamo.mark_dynamic(ref_per_token_logps_chunk, 1) + if ref_input_chunk is not None: + torch._dynamo.mark_dynamic(ref_input_chunk, 1) + if old_per_token_logps_chunk is not None: + torch._dynamo.mark_dynamic(old_per_token_logps_chunk, 1) + if vllm_is_ratio_chunk is not None: + torch._dynamo.mark_dynamic(vllm_is_ratio_chunk, 1) + + accumulate_chunk( + input_chunk, + selected_token_ids_chunk, + attention_mask_chunk, + advantages_chunk, + ref_per_token_logps_chunk, + old_per_token_logps_chunk, + ref_input_chunk, + vllm_is_ratio_chunk, + ) + + # Combine gradients + grad_input = torch.cat(grad_inputs, dim=0) + + # Save for backward + ctx.save_for_backward(grad_input, grad_weight, grad_bias) + + # Finalize metrics + final_metrics = [] + for metric in aggregated_metrics: + if isinstance(metric, list): + final_metrics.append(torch.cat(metric, dim=0)) + else: + final_metrics.append(metric) + + return loss_acc, tuple(final_metrics) + + @staticmethod + def _compute_dapo_normalizer(attention_mask): + """Global active tokens averaged per process.""" + normalizer = attention_mask.to(torch.float32).sum() + world_size = 1 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + import torch.distributed as dist + + normalizer = normalizer.clone() + dist.all_reduce(normalizer, op=dist.ReduceOp.SUM) + world_size = dist.get_world_size() + + normalizer = normalizer / world_size + return torch.clamp(normalizer, min=1.0) + + @staticmethod + def _compute_chunk_loss( + input_chunk, + weight, + selected_token_ids_chunk, + 
attention_mask_chunk, + advantages_chunk, + bias=None, + ref_per_token_logps_chunk=None, + old_per_token_logps_chunk=None, + ref_input_chunk=None, + vllm_is_ratio_chunk=None, + ref_weight=None, + ref_bias=None, + full_attention_mask=None, + epsilon_low=0.2, + epsilon_high=0.2, + beta=0.04, + loss_type="dapo", + max_completion_length=None, + importance_sampling_level="token", + temperature=1.0, + use_ref_model=False, + ppo_loss_fn=None, + sapo_temperature_pos=1.0, + sapo_temperature_neg=1.05, + delta=None, + use_bias_correction_kl=False, + ): + """Compute loss for a single chunk.""" + # Get policy log probabilities using chunk_forward + log_probs, _ = LigerFusedLinearPPOBase.chunk_forward(input_chunk, weight, bias=bias, temperature=temperature) + + # Get reference log probabilities if needed + ref_log_probs = None + if use_ref_model and ref_per_token_logps_chunk is None: + with torch.no_grad(): + ref_log_probs, _ = LigerFusedLinearPPOBase.chunk_forward( + ref_input_chunk, ref_weight, bias=ref_bias, temperature=temperature + ) + + # Compute chunk loss and metrics using the provided loss function + chunk_loss, chunk_metrics = ppo_loss_fn( + log_probs=log_probs, + selected_token_ids=selected_token_ids_chunk, + attention_mask=attention_mask_chunk, + advantages=advantages_chunk, + full_attention_mask=full_attention_mask, + ref_per_token_logps=ref_per_token_logps_chunk.float() if ref_per_token_logps_chunk is not None else None, + old_per_token_logps=old_per_token_logps_chunk.float() if old_per_token_logps_chunk is not None else None, + ref_log_probs=ref_log_probs, # used when ref_per_token_logps is None + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + beta=beta, + loss_type=loss_type, + max_completion_length=max_completion_length, + importance_sampling_level=importance_sampling_level, + sapo_temperature_pos=sapo_temperature_pos, + sapo_temperature_neg=sapo_temperature_neg, + vllm_is_ratio=vllm_is_ratio_chunk, + delta=delta, + use_bias_correction_kl=use_bias_correction_kl, + ) + + return chunk_loss, chunk_metrics + + @staticmethod + def chunk_forward(input_chunk, weight, bias=None, temperature=1.0): + """Forward pass computation for a single chunk without explicit reshaping.""" + # Directly compute logits via batched matrix multiplication: [B, T, H] @ [H, V] -> [B, T, V] + logits = torch.matmul(input_chunk, weight.t()) + if bias is not None: + logits = logits + bias # Broadcasts bias to [B, T, V] + if temperature != 1.0: + logits = logits / temperature + + # Compute log probabilities using softmax over the last dimension + log_probs = F.log_softmax(logits.float(), dim=-1) + + return log_probs, logits + + @staticmethod + def backward(ctx, grad_output, *grad_metrics): + """Backward pass for PPO loss.""" + grad_input, grad_weight, grad_bias = ctx.saved_tensors + + if grad_output != 1.0: + grad_input = grad_input * grad_output + grad_weight = grad_weight * grad_output + if grad_bias is not None: + grad_bias = grad_bias * grad_output + + return ( + grad_input, + grad_weight, + None, # grad_selected_token_ids + None, # grad_attention_mask + None, # grad_advantages + grad_bias, + None, # grad_ref_per_token_logps + None, # grad_old_per_token_logps + None, # grad_ref_input + None, # grad_ref_weight + None, # grad_ref_bias + None, # grad_epsilon_low + None, # grad_epsilon_high + None, # grad_beta + None, # grad_loss_type + None, # grad_max_completion_length + None, # grad_importance_sampling_level + None, # grad_temperature + None, # grad_compiled + None, # grad_use_ref_model + None, # 
grad_chunk_size + None, # grad_sapo_temperature_pos + None, # grad_sapo_temperature_neg + None, # grad_vllm_is_ratio + None, # grad_delta + None, # grad_use_bias_correction_kl + ) diff --git a/src/liger_kernel/chunked_loss/fused_linear_preference.py b/src/liger_kernel/chunked_loss/fused_linear_preference.py new file mode 100755 index 0000000000000000000000000000000000000000..72269be663089da7db09b87c0a9fc494adbeb3e3 --- /dev/null +++ b/src/liger_kernel/chunked_loss/fused_linear_preference.py @@ -0,0 +1,433 @@ +from abc import abstractmethod +from functools import partial + +import torch + +from torch.nn import functional as F + + +class LigerFusedLinearPreferenceBase(torch.autograd.Function): + @abstractmethod + def preference_loss_fn(*args, **kwargs): + """ + To be extended by subclasses. + """ + raise NotImplementedError("Preference loss function must be implemented.") + + @staticmethod + def forward( + cls, + ctx, + _input, + weight, + target, + bias=None, + chunk_size=1, + ignore_index=-100, + alpha=1.0, + beta=0.1, + compute_nll_loss=True, + nll_target=None, + compiled=True, + use_ref_model=False, + ref_input=None, + ref_weight=None, + ref_bias=None, + average_log_prob=True, + **loss_kwargs, + ): + """ + Base class for fused linear layer with preference loss. + Expects _input to be stacked with chosen and rejected inputs on the batch dimension. + + The mental model is: + + forward() + ├── Loop over chunks + └── compute_loss() + ├── chunk_forward() # Compute logits and log probs + └── prefer_loss() # Calculate preference loss + + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size, seq_len, hidden_size). + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size). + target (torch.Tensor): Target tensor. Shape: (batch_size, seq_len). + bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,). + loss_fn (callable): Loss function to compute the loss on a chunk of input/target. + chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs). + ignore_index (int): Index to ignore for loss computation. + alpha (float): Weight for the NLL loss. + beta (float): Weight for the preference loss. + compute_nll_loss (bool): Whether to compute NLL loss. + nll_target (torch.Tensor, optional): Target tensor for NLL loss. Shape: (batch_size, seq_len). If not provided the target is used. + compiled (bool): Whether to use torch compile for chunk accumulation. + use_ref_model (bool): Whether to use a reference model for the alignment loss. + ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size). + ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,). + average_log_prob (bool): Whether to average log probabilities or to sum them over the completion. 
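+            ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size, seq_len, hidden_size).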
+ loss_kwargs (dict): Other possible arguments that a loss function might need + """ + # TODO: Tune CHUNK_SIZE to fully utilize the GPU + CHUNK_SIZE = chunk_size + + # Gradients to be accumulated + grad_weight = torch.zeros_like(weight) + grad_chosen_inputs = [] + grad_rejected_inputs = [] + grad_bias = torch.zeros_like(bias) if bias is not None else None + + # Loss to be accumulated + loss_acc = torch.zeros((), device=_input.device) + + # Metrics to be recorded + policy_chosen_logps = [] + policy_rejected_logps = [] + policy_chosen_logits_mean = torch.zeros((), device=_input.device) + policy_rejected_logits_mean = torch.zeros((), device=_input.device) + policy_nll_loss = torch.zeros((), device=_input.device) + aggregated_aux_outputs = [] # aggregated aux outputs from all chunks + + compute_loss = partial( + LigerFusedLinearPreferenceBase._compute_loss, + preference_loss_fn=cls.preference_loss_fn, + ignore_index=ignore_index, + alpha=alpha, + beta=beta, + compute_nll_loss=compute_nll_loss, + full_target=target, + use_ref_model=use_ref_model, + ref_weight=ref_weight, + ref_bias=ref_bias, + full_nll_target=nll_target, + average_log_prob=average_log_prob, + **loss_kwargs, + ) + + def fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk): + """ + Fused forward and backward pass for a chunk of input and target. + """ + if bias is not None: + return torch.func.grad_and_value(compute_loss, argnums=(0, 1, 3), has_aux=True)( + input_chunk, + weight, + target_chunk, + bias, + ref_input_chunk=ref_input_chunk, + chosen_nll_target_chunk=chosen_nll_target_chunk, + ) + else: + return torch.func.grad_and_value(compute_loss, argnums=(0, 1), has_aux=True)( + input_chunk, + weight, + target_chunk, + ref_input_chunk=ref_input_chunk, + chosen_nll_target_chunk=chosen_nll_target_chunk, + ) + + def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None, chosen_nll_target_chunk=None): + if bias is not None: + ( + (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), + ( + chunk_loss, + ( + chunk_chosen_logps, + chunk_rejected_logps, + chunk_chosen_logits_mean, + chunk_rejected_logits_mean, + chunk_nll_loss, + *aux_outputs, + ), + ), + ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk) + grad_bias.add_(chunk_grad_bias) # accumulate bias gradient + else: + ( + (chunk_grad_input, chunk_grad_weight), + ( + chunk_loss, + ( + chunk_chosen_logps, + chunk_rejected_logps, + chunk_chosen_logits_mean, + chunk_rejected_logits_mean, + chunk_nll_loss, + *aux_outputs, + ), + ), + ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk) + + # Accumulate gradients + grad_weight.add_(chunk_grad_weight) + grad_chosen_inputs.append(chunk_grad_input[: chosen_target_chunk.shape[0]]) + grad_rejected_inputs.append(chunk_grad_input[chosen_target_chunk.shape[0] :]) + + # Accumulate loss + loss_acc.add_(chunk_loss) + + # Accumulate metrics + policy_chosen_logps.append(chunk_chosen_logps) + policy_rejected_logps.append(chunk_rejected_logps) + policy_chosen_logits_mean.add_(chunk_chosen_logits_mean) + policy_rejected_logits_mean.add_(chunk_rejected_logits_mean) + policy_nll_loss.add_(chunk_nll_loss) + + # aux_outputs + # Initialize storage for aux_outputs + if len(aggregated_aux_outputs) == 0: + for aux in aux_outputs: + if aux.ndim == 0: + aggregated_aux_outputs.append(torch.zeros((), device=aux.device)) + else: + aggregated_aux_outputs.append([]) + + # Process each aux_output + for i, aux in enumerate(aux_outputs): + if aux.ndim == 0: 
+ aggregated_aux_outputs[i].add_(aux) + else: + aggregated_aux_outputs[i].append(aux) + + if compiled: + fused_fwd_bwd = torch.compile(fused_fwd_bwd) + + len_chosen = target.shape[0] // 2 + chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE)) + _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0) + _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0) + _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0) + _rejected_target_chunks = torch.chunk(target[len_chosen:], chunks=chunks, dim=0) + + if nll_target is not None: + _chosen_nll_target_chunks = torch.chunk(nll_target[:len_chosen], chunks=chunks, dim=0) + + if use_ref_model: + _ref_chosen_input_chunks = torch.chunk(ref_input[:len_chosen], chunks=chunks, dim=0) + _ref_rejected_input_chunks = torch.chunk(ref_input[len_chosen:], chunks=chunks, dim=0) + + for ( + chosen_input_chunk, + rejected_input_chunk, + chosen_target_chunk, + rejected_target_chunk, + ref_chosen_input_chunk, + ref_rejected_input_chunk, + chosen_nll_target_chunk, + ) in zip( + _chosen_input_chunks, + _rejected_input_chunks, + _chosen_target_chunks, + _rejected_target_chunks, + (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)), + (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)), + (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)), + ): + input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0) + ref_input_chunk = ( + torch.cat([ref_chosen_input_chunk, ref_rejected_input_chunk], dim=0) if use_ref_model else None + ) + target_chunk = torch.cat([chosen_target_chunk, rejected_target_chunk], dim=0) + + # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation + torch._dynamo.mark_dynamic(input_chunk, 1) + torch._dynamo.mark_dynamic(target_chunk, 1) + torch._dynamo.mark_dynamic(target, 1) + torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None + torch._dynamo.mark_dynamic(chosen_nll_target_chunk, 1) if nll_target is not None else None + + # accumulate loss, gradients, and metrics + accumulate_chunk(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk) + + # combine grad_chosen_inputs and grad_rejected_inputs + grad_inputs = grad_chosen_inputs + grad_rejected_inputs + policy_chosen_logps = torch.cat(policy_chosen_logps, dim=0) + policy_rejected_logps = torch.cat(policy_rejected_logps, dim=0) + + # Aggregate aux outputs lists into tensors + for i, aux in enumerate(aggregated_aux_outputs): + if isinstance(aux, list): + aggregated_aux_outputs[i] = torch.cat(aux, dim=0) + + ctx.save_for_backward( + torch.cat(grad_inputs, dim=0), + grad_weight, + grad_bias, + ) + return_vars = ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits_mean, + policy_rejected_logits_mean, + policy_nll_loss, + ) + return loss_acc, (*return_vars, *aggregated_aux_outputs) + + @staticmethod + def backward(ctx, *grad_output): + grad_input, grad_weight, grad_bias = ctx.saved_tensors + if torch.ne(grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)): + grad_input = grad_input * grad_output[0][0] + grad_weight = grad_weight * grad_output[0][0] + grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None + + return grad_input, grad_weight, None, grad_bias, None, None, None, None + + @staticmethod + def chunk_forward( + input_chunk, + weight, + target_chunk, + bias=None, + 
ignore_index=-100, + compute_nll_loss=True, + chosen_nll_target_chunk=None, + average_log_prob=True, + ): + len_chosen_chunk = target_chunk.shape[0] // 2 + logits_chunk = input_chunk @ weight.t() + if bias is not None: + logits_chunk = logits_chunk + bias + log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1) + + chosen_nll_loss = 0.0 + if compute_nll_loss: + nll_labels = ( + chosen_nll_target_chunk if chosen_nll_target_chunk is not None else target_chunk[:len_chosen_chunk] + ) + chosen_nll_loss = F.nll_loss( + log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]), + nll_labels.view(-1), + reduction="sum", + ignore_index=ignore_index, + ) + + loss_mask = target_chunk != ignore_index + label_chunk = torch.where(loss_mask, target_chunk, 0) + + per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1) + if average_log_prob: + log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1) + else: + log_prob = (per_token_logps * loss_mask).sum(-1) + + chosen_logps = log_prob[:len_chosen_chunk] + rejected_logps = log_prob[len_chosen_chunk:] + + chosen_logits = logits_chunk[:len_chosen_chunk] + rejected_logits = logits_chunk[len_chosen_chunk:] + + return ( + chosen_logps, + rejected_logps, + chosen_logits, + rejected_logits, + chosen_nll_loss, + ) + + @staticmethod + def _compute_loss( + input_chunk, + weight, + target_chunk, + bias=None, + preference_loss_fn=None, + full_target=None, + ignore_index=-100, + alpha=1.0, + beta=0.1, + compute_nll_loss=True, + use_ref_model=False, + ref_input_chunk=None, + ref_weight=None, + ref_bias=None, + full_nll_target=None, + chosen_nll_target_chunk=None, + average_log_prob=True, + **loss_kwargs, + ): + """ + Compute the total loss for a chunk of input and target, while using an alignment/preference loss function. + Args: + preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target. + input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size). + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size). + target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length). + bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,). + full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length). + ignore_index (int): Index to ignore for loss computation. + alpha (float): Weight for the NLL loss. + beta (float): Weight for the preference loss. + compute_nll_loss (bool): Whether to compute NLL loss. + use_ref_model (bool): Whether to use a reference model for the alignment loss. + ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size). + ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,). + full_nll_target (torch.Tensor, optional): Full target tensor for NLL loss. Shape: (batch_size, sequence_length). + chosen_nll_target_chunk (torch.Tensor, optional): Target tensor for NLL loss. Shape: (chunk_size, sequence_length) If not provided the target_chunk is used. + average_log_prob (bool): Whether to average log probabilities or the sum. + loss_kwargs (dict): Additional arguments for the loss function. 
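+        Returns:
+            The combined loss for the chunk (alpha * NLL loss + preference loss), plus a tuple of
+            (chosen_logps, rejected_logps, chosen_logits_mean, rejected_logits_mean, chosen_nll_loss, *aux_outputs).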
+ """ + ( + chosen_logps, + rejected_logps, + chosen_logits, + rejected_logits, + chosen_nll_loss, + ) = LigerFusedLinearPreferenceBase.chunk_forward( + input_chunk, + weight, + target_chunk, + bias=bias, + ignore_index=ignore_index, + compute_nll_loss=compute_nll_loss, + chosen_nll_target_chunk=chosen_nll_target_chunk, + average_log_prob=average_log_prob, + ) + if full_nll_target is not None: + chosen_nll_loss = chosen_nll_loss / (full_nll_target[: full_nll_target.shape[0] // 2] != ignore_index).sum() + else: + chosen_nll_loss = chosen_nll_loss / (full_target[: full_target.shape[0] // 2] != ignore_index).sum() + + chosen_logits_mean = chosen_logits.sum() / (full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]) + rejected_logits_mean = rejected_logits.sum() / ( + full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0] + ) + + if use_ref_model: + with torch.no_grad(): + ( + ref_chosen_logps, + ref_rejected_logps, + _, + _, + _, + ) = LigerFusedLinearPreferenceBase.chunk_forward( + ref_input_chunk, + ref_weight, + target_chunk, + ref_bias, + ignore_index=ignore_index, + compute_nll_loss=False, # We don't need NLL loss for the reference model + chosen_nll_target_chunk=None, + average_log_prob=average_log_prob, + ) + loss_kwargs["ref_chosen_logps"] = ref_chosen_logps + loss_kwargs["ref_rejected_logps"] = ref_rejected_logps + + preference_loss_outputs = preference_loss_fn( + chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs + ) + if isinstance(preference_loss_outputs, tuple): + preference_loss, *aux_outputs = preference_loss_outputs + else: + preference_loss, aux_outputs = preference_loss_outputs, [] + + loss = alpha * chosen_nll_loss + preference_loss + return_vars = ( + chosen_logps, + rejected_logps, + chosen_logits_mean, + rejected_logits_mean, + chosen_nll_loss, + ) + return loss, (*return_vars, *aux_outputs) diff --git a/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py b/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py new file mode 100755 index 0000000000000000000000000000000000000000..73118e493dd9882d4debd5c4ddce8c6b5e6bacaa --- /dev/null +++ b/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py @@ -0,0 +1,341 @@ +from abc import abstractmethod +from functools import partial + +import torch + +from torch.nn import functional as F + + +class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function): + @abstractmethod + def preference_loss_fn(*args, **kwargs): + """ + To be extended by subclasses. + """ + raise NotImplementedError("Preference loss function must be implemented.") + + @staticmethod + def forward( + cls, + ctx, + _input, + weight, + target, + preference_labels, + bias=None, + chunk_size=1, + ignore_index=-100, + compiled=True, + use_ref_model=False, + ref_input=None, + ref_weight=None, + ref_bias=None, + average_log_prob=False, + **loss_kwargs, + ): + """ + Base class for fused linear layer with unpaired preference loss like KTO + Expects _input to be stacked with chosen and rejected inputs on the batch dimension. + + The mental model is: + + forward() + ├── Loop over chunks + └── compute_loss() + ├── chunk_forward() # Compute logits and log probs + └── prefer_loss() # Calculate preference loss + + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size, seq_len, hidden_size). + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size). + target (torch.Tensor): Target tensor. Shape: (batch_size, seq_len). + bias (torch.Tensor, optional): Bias tensor. 
Shape: (vocab_size,). + loss_fn (callable): Loss function to compute the loss on a chunk of input/target. + chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs). + ignore_index (int): Index to ignore for loss computation. + beta (float): Weight for the preference loss. + compiled (bool): Whether to use torch compile for chunk accumulation. + use_ref_model (bool): Whether to use a reference model for the alignment loss. + preference_labels (torch.Tensor): Boolean tensor indicating chosen (True) vs rejected (False) examples. + Shape: (batch_size,). + ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size). + ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,). + average_log_prob (bool): Whether to average the log probability per non-masked token. + loss_kwargs (dict): Other possible arguments that a loss function might need + """ + # TODO: Tune CHUNK_SIZE to fully utilize the GPU + CHUNK_SIZE = chunk_size + + # Gradients to be accumulated + grad_inputs = [] + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) if bias is not None else None + + # Loss to be accumulated + loss_acc = torch.zeros((), device=_input.device) + + # Metrics to be recorded + chosen_logps_sum = torch.zeros((), device=_input.device) + rejected_logps_sum = torch.zeros((), device=_input.device) + chosen_logits_sum = torch.zeros((), device=_input.device) + rejected_logits_sum = torch.zeros((), device=_input.device) + aggregated_aux_outputs = [] + + compute_loss = partial( + LigerFusedLinearUnpairedPreferenceBase._compute_loss, + preference_loss_fn=cls.preference_loss_fn, + full_target=target, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + ref_weight=ref_weight, + ref_bias=ref_bias, + average_log_prob=average_log_prob, + **loss_kwargs, + ) + + def fused_fwd_bwd(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk): + """ + Fused forward and backward pass for a chunk of input and target. 
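+            Uses torch.func.grad_and_value to differentiate with respect to the input,
+            weight, and (when present) bias, returning those gradients together with
+            the loss and its aux outputs in a single pass.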
+ """ + argnums = (0, 1, 4) if bias is not None else (0, 1) + return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=True)( + input_chunk, + weight, + target_chunk, + preference_labels_chunk, + bias, + ref_input_chunk=ref_input_chunk, + ) + + def accumulate_chunk( + input_chunk, + target_chunk, + preference_labels_chunk=None, + ref_input_chunk=None, + ): + ( + (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), + ( + chunk_loss, + ( + chunk_chosen_logps_sum, + chunk_rejected_logps_sum, + chunk_chosen_logits_sum, + chunk_rejected_logits_sum, + *aux_outputs, + ), + ), + ) = fused_fwd_bwd(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk) + if bias is not None: + grad_bias.add_(chunk_grad_bias[0]) # accumulate bias gradient + + # Accumulate gradients + grad_weight.add_(chunk_grad_weight) + grad_inputs.append(chunk_grad_input) + + # Accumulate loss + loss_acc.add_(chunk_loss) + + # Accumulate metrics + chosen_logps_sum.add_(chunk_chosen_logps_sum) + rejected_logps_sum.add_(chunk_rejected_logps_sum) + chosen_logits_sum.add_(chunk_chosen_logits_sum) + rejected_logits_sum.add_(chunk_rejected_logits_sum) + + # aux_outputs + # Initialize storage for aux_outputs + if len(aggregated_aux_outputs) == 0: + for aux in aux_outputs: + aggregated_aux_outputs.append(torch.zeros((), device=aux.device)) + + # Process each aux_output + for i, aux in enumerate(aux_outputs): + if aux.ndim == 0: + aggregated_aux_outputs[i].add_(aux) + + if compiled: + fused_fwd_bwd = torch.compile(fused_fwd_bwd) + + # When not paired, use labels to separate chosen and rejected + assert preference_labels is not None, "preference_labels must be provided for unpaired preference loss" + + chunks = max(1, _input.shape[0] // CHUNK_SIZE) + _input_chunks = torch.chunk(_input, chunks=chunks, dim=0) + _target_chunks = torch.chunk(target, chunks=chunks, dim=0) + _preference_labels_chunks = torch.chunk(preference_labels, chunks=chunks, dim=0) + + if use_ref_model: + _ref_input_chunks = torch.chunk(ref_input, chunks=chunks, dim=0) + + for ( + input_chunk, + target_chunk, + ref_input_chunk, + preference_labels_chunk, + ) in zip( + _input_chunks, + _target_chunks, + (_ref_input_chunks if use_ref_model else [None] * len(_input_chunks)), + _preference_labels_chunks, + ): + # mark input_chunk, target_chunk, and target dimension 1 (sequence length) as dynamic to prevent torch.compile recompilation + torch._dynamo.mark_dynamic(input_chunk, 1) + torch._dynamo.mark_dynamic(target_chunk, 1) + torch._dynamo.mark_dynamic(target, 1) + torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None + torch._dynamo.mark_dynamic(preference_labels_chunk, 1) + + # accumulate loss, gradients, and metrics + accumulate_chunk(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk) + + # Aggregate aux outputs lists into tensors + for i, aux in enumerate(aggregated_aux_outputs): + if isinstance(aux, list): + aggregated_aux_outputs[i] = torch.cat(aux, dim=0) + + ctx.save_for_backward( + torch.cat(grad_inputs, dim=0), + grad_weight, + grad_bias, + ) + + return_vars = ( + chosen_logps_sum, + rejected_logps_sum, + chosen_logits_sum, + rejected_logits_sum, + ) + + return loss_acc, (*return_vars, *aggregated_aux_outputs) + + @staticmethod + def backward(ctx, *grad_output): + grad_input, grad_weight, grad_bias = ctx.saved_tensors + if torch.ne(grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)): + grad_input = grad_input * grad_output[0][0] + grad_weight = grad_weight * grad_output[0][0] + 
grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None  # scale bias grad by the same upstream scalar
+
+        return grad_input, grad_weight, None, None, grad_bias
+
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        preference_labels_chunk,
+        bias=None,
+        ignore_index=-100,
+        average_log_prob=False,
+    ):
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+        loss_mask_chunk = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask_chunk, target_chunk, 0)
+
+        per_token_logps_chunk = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
+        if average_log_prob:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1) / loss_mask_chunk.sum(-1)
+        else:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1)
+
+        chosen_logps_sum = (log_probs * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logps_sum = (log_probs * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        chosen_logits_sum = (logits_chunk * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logits_sum = (logits_chunk * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        return (
+            log_probs,
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        )
+
+    @staticmethod
+    def _compute_loss(
+        input_chunk,
+        weight,
+        target_chunk,
+        preference_labels_chunk,
+        bias=None,
+        preference_loss_fn=None,
+        full_target=None,
+        ignore_index=-100,
+        use_ref_model=False,
+        ref_input_chunk=None,
+        ref_weight=None,
+        ref_bias=None,
+        average_log_prob=False,
+        **loss_kwargs,
+    ):
+        """
+        Compute the total loss for a chunk of input and target using an alignment/preference loss function.
+        Args:
+            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (chunk_size, sequence_length, hidden_size).
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
+            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (chunk_size, sequence_length).
+            preference_labels_chunk (torch.Tensor): Chunk of boolean preference labels. Shape: (chunk_size,).
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
+            ignore_index (int): Index to ignore for loss computation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
+            loss_kwargs (dict): Additional arguments for the loss function.
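+        Returns:
+            tuple: (preference_loss_chunk, (chosen_logps_sum, rejected_logps_sum,
+            chosen_logits_sum, rejected_logits_sum, *aux_outputs)).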
+ """ + ( + log_prob_chunk, + chosen_logps_sum, + rejected_logps_sum, + chosen_logits_sum, + rejected_logits_sum, + ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward( + input_chunk, + weight, + target_chunk, + preference_labels_chunk, + bias=bias, + ignore_index=ignore_index, + average_log_prob=average_log_prob, + ) + + if use_ref_model: + with torch.no_grad(): + ( + ref_log_prob_chunk, + _, + _, + _, + _, + ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward( + ref_input_chunk, + ref_weight, + target_chunk, + preference_labels_chunk, + ref_bias, + ignore_index=ignore_index, + average_log_prob=average_log_prob, + ) + loss_kwargs["ref_log_prob_chunk"] = ref_log_prob_chunk + + preference_loss_outputs = preference_loss_fn( + log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs + ) + if isinstance(preference_loss_outputs, tuple): + preference_loss_chunk, *aux_outputs = preference_loss_outputs + else: + preference_loss_chunk, aux_outputs = preference_loss_outputs, [] + + return_vars = ( + chosen_logps_sum, + rejected_logps_sum, + chosen_logits_sum, + rejected_logits_sum, + ) + + return preference_loss_chunk, (*return_vars, *aux_outputs) diff --git a/src/liger_kernel/chunked_loss/grpo_loss.py b/src/liger_kernel/chunked_loss/grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..f05cc874468028274a3f8f3b53698fbe1a9ff3e1 --- /dev/null +++ b/src/liger_kernel/chunked_loss/grpo_loss.py @@ -0,0 +1,462 @@ +from typing import Optional + +import torch + +from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase + + +def k3_loss_fn(log_p, log_q): + # computes k3 estimate of KL[q, p] + # ref: http://joschu.net/blog/kl-approx.html + return torch.exp(log_p - log_q) - (log_p - log_q) - 1.0 + + +def sapo_loss_fn(importance_ratio: torch.Tensor, temperature: float) -> torch.Tensor: + """SAPO (Soft Adaptive Policy Optimization) loss function. + + Replaces hard clipping with a smooth, temperature-controlled gate that + adaptively attenuates off-policy updates while preserving useful learning signals. + + Reference: https://huggingface.co/papers/2511.20347 + TRL implementation: https://github.com/huggingface/trl/blob/1bd2a52ec2d8344050af736d60cdc735181ae4b8/trl/trainer/grpo_trainer.py#L1913 + + Args: + importance_ratio: The importance sampling ratio (pi_theta / pi_old). + temperature: Temperature parameter controlling the softness of the gate. + + Returns: + The SAPO loss value. 
+ """ + if temperature <= 0: + raise ValueError("sapo_temperature must be > 0.") + sigmoid_input = temperature * (importance_ratio - 1) + sigmoid_smoothed_loss = torch.sigmoid(sigmoid_input) + return sigmoid_smoothed_loss * 4 / temperature + + +def clip_coef_fn(coef, epsilon_low, epsilon_high, loss_type): + if loss_type == "cispo": + # CISPO: clip and detach the importance weights + upper_bound = epsilon_high + lower_bound = None + clipped_coef = torch.clamp(coef, lower_bound, upper_bound).detach() + is_lower_clipped = False + is_upper_clipped = coef > upper_bound + elif loss_type == "sapo": + # SAPO doesn't use clipping metrics + clipped_coef = None + is_lower_clipped = torch.zeros_like(coef, dtype=torch.bool) + is_upper_clipped = torch.zeros_like(coef, dtype=torch.bool) + else: + upper_bound = 1 + epsilon_high + lower_bound = 1 - epsilon_low + clipped_coef = torch.clamp(coef, lower_bound, upper_bound) + is_lower_clipped = coef < lower_bound + is_upper_clipped = coef > upper_bound + return clipped_coef, is_lower_clipped, is_upper_clipped + + +class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase): + @staticmethod + def ppo_loss_fn( + log_probs, + selected_token_ids, + attention_mask, + advantages, + full_attention_mask, + ref_per_token_logps=None, # shape: [chunk_size, seq_len] + old_per_token_logps=None, + ref_log_probs=None, # used when ref_per_token_logps is None (shape: [chunk_size, seq_len, vocab_size]) + epsilon_low=0.2, + epsilon_high=0.2, + beta=0.04, + loss_type="dapo", # ["grpo", "bnpo", "dr_grpo", "dapo", "cispo", "sapo", "luspo"] + max_completion_length=None, # Required for dr_grpo + importance_sampling_level="token", # ["token", "sequence"] - new parameter for GSPO + sapo_temperature_pos=1.0, # Temperature for positive advantages in SAPO + sapo_temperature_neg=1.05, # Temperature for negative advantages in SAPO + vllm_is_ratio=None, # vLLM importance sampling ratio (chunk_size, seq_len) or (chunk_size, 1) or None + delta=None, # Upper clamp for two-sided clipping (INTELLECT-2) + use_bias_correction_kl=False, # Importance-sampling-corrected KL (DeepSeek-V3.2) + **kwargs, + ): + """GRPO Loss Function matching GRPOTrainer implementation.""" + # Validate sequence-level + loss_type combinations + if importance_sampling_level == "sequence" and loss_type in ("cispo", "sapo"): + raise ValueError( + f"Sequence-level importance sampling is not supported for loss_type='{loss_type}'. " + f"Use importance_sampling_level='token' instead." + ) + + per_token_logps = log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze( + -1 + ) # (batch_size, seq_len) + + # Get reference model probabilities + if ref_per_token_logps is None: + if ref_log_probs is not None: + with torch.no_grad(): + ref_per_token_logps = ref_log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze( + -1 + ) + else: + ref_per_token_logps = per_token_logps.detach() + + # Compute policy gradient loss with importance sampling ratio + old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach() + log_ratio = per_token_logps - old_per_token_logps + + if importance_sampling_level == "token": + log_importance_weights = log_ratio + elif importance_sampling_level == "sequence": + log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0) + log_importance_weights = log_importance_weights.unsqueeze(-1) + else: + raise ValueError( + f"Unknown importance sampling level: {importance_sampling_level}. 
Possible values are 'token' " + "and 'sequence'." + ) + + # From here, log_importance_weights (and all subsequent tensors, coef_1, coef_2, etc.) shape depends on + # importance_sampling_level: "token" level: (B, T); "sequence" level: (B, 1) + coef_1 = torch.exp(log_importance_weights) + coef_2, is_lower_clipped, is_upper_clipped = clip_coef_fn(coef_1, epsilon_low, epsilon_high, loss_type) + if loss_type == "cispo": + # CISPO: clip and detach the importance weights, multiply by log probs + # Reference: https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030 + per_token_loss = -coef_2 * advantages.unsqueeze(1) * per_token_logps + elif loss_type == "sapo": + # SAPO: Soft Adaptive Policy Optimization + # Uses sigmoid-based soft gating instead of hard clipping + # Reference: https://huggingface.co/papers/2511.20347 + # TRL implementation: https://github.com/huggingface/trl/blob/1bd2a52ec2d8344050af736d60cdc735181ae4b8/trl/trainer/grpo_trainer.py#L2037-L2046 + per_token_loss = torch.empty_like(coef_1) + # Expand advantages to match coef_1 shape for masking + advantages_expanded = advantages.unsqueeze(1).expand_as(coef_1) + positive_advantages_mask = advantages_expanded > 0 + + # Apply different temperatures based on advantage sign + per_token_loss[positive_advantages_mask] = sapo_loss_fn( + coef_1[positive_advantages_mask], sapo_temperature_pos + ) + per_token_loss[~positive_advantages_mask] = sapo_loss_fn( + coef_1[~positive_advantages_mask], sapo_temperature_neg + ) + per_token_loss = -per_token_loss * advantages_expanded + else: + # Apply delta (two-sided clipping from INTELLECT-2) to coef_1 + if delta is not None: + coef_1 = torch.clamp(coef_1, max=delta) + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + + # Apply vLLM importance sampling correction BEFORE adding KL penalty + if vllm_is_ratio is not None: + per_token_loss = per_token_loss * vllm_is_ratio + + if beta != 0.0: + # Compute KL penalty (approximates KL[per_token_logps, ref_per_token_logps]) + kl_div = k3_loss_fn(ref_per_token_logps, per_token_logps) + if use_bias_correction_kl: + # Importance-sampling-corrected KL (DeepSeek-V3.2): kl *= token-level coef_1 + token_coef_1 = torch.exp(per_token_logps - old_per_token_logps) + kl_div = kl_div * token_coef_1 + # Combine losses + per_token_loss = per_token_loss + beta * kl_div + + # Note: We normalize by the number of tokens in the batch (using full_attention_mask), + # which is consistent with the DAPO loss implementation (https://arxiv.org/html/2503.14476v1) + # and TRL GRPO implementation + # (https://github.com/huggingface/trl/blob/e751a16df56e70190fb94bed4a2035eec3303777/trl/trainer/grpo_trainer.py#L966) + if loss_type == "grpo" or loss_type == "sapo": + # Average per-sequence loss (SAPO uses same normalization as GRPO) + loss = ( + (per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0) + ).sum() / full_attention_mask.shape[0] + elif loss_type == "bnpo": + # Batch Normalized Per-token loss (original implementation) + loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0) + elif loss_type == "dr_grpo": + # Dimension-Reduced GRPO (normalize by batch_size * max_completion_length) + if max_completion_length is None: + raise ValueError("max_completion_length must be provided for loss_type 'dr_grpo'") + loss = (per_token_loss * 
attention_mask).sum() / (full_attention_mask.shape[0] * max_completion_length) + elif loss_type == "dapo" or loss_type == "cispo": + loss_normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(full_attention_mask) + loss = (per_token_loss * attention_mask).sum() / loss_normalizer + elif loss_type == "luspo": + # LUSPO: loss = (per_token_loss * mask.sum(1, keepdim=True)).mean() + # Reformulated as: sum_i(sum_j(per_token_loss_ij) * seq_len_i) / numel + # to avoid (B,T) * (B,1) broadcast which amplifies torch.compile differences. + seq_lens = attention_mask.sum(-1) # (chunk_B,) + per_seq_sum = per_token_loss.sum(-1) # (chunk_B,) + weighted = per_seq_sum * seq_lens # (chunk_B,) + if importance_sampling_level == "sequence" and beta == 0.0: + # per_token_loss stays (B, 1), so .mean() divides by B + loss = weighted.sum() / full_attention_mask.shape[0] + else: + # per_token_loss is (B, T), .mean() divides by B*T + loss = weighted.sum() / (full_attention_mask.shape[0] * full_attention_mask.shape[1]) + else: + raise ValueError(f"Unknown loss type: {loss_type}") + + # Calculate metrics + metrics = [] + if beta != 0.0: + metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))) + + # Adjust clipping metric calculation based on importance sampling level + if importance_sampling_level == "token": + is_clipped = (is_lower_clipped & (advantages.unsqueeze(1) < 0)) | ( + is_upper_clipped & (advantages.unsqueeze(1) > 0) + ) + else: # sequence level + # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,) + is_clipped = (is_lower_clipped & (advantages.unsqueeze(1) < 0)) | ( + is_upper_clipped & (advantages.unsqueeze(1) > 0) + ) + is_clipped = is_clipped.expand_as(attention_mask) + + metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)) + return loss, metrics + + @classmethod + def forward( + cls, + ctx, + _input, + weight, + selected_token_ids, + attention_mask, + advantages, + bias=None, + ref_per_token_logps=None, + old_per_token_logps=None, + ref_input=None, + ref_weight=None, + ref_bias=None, + beta=0.04, + epsilon_low=0.2, + epsilon_high=0.2, + loss_type="dapo", + max_completion_length=None, + importance_sampling_level="token", + sapo_temperature_pos=1.0, + sapo_temperature_neg=1.05, + temperature=1.0, + compiled=True, + use_ref_model=True, + chunk_size=1, + vllm_is_ratio=None, + delta=None, + use_bias_correction_kl=False, + ): + """ + Fused linear layer with GRPO loss. + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size) + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size) + selected_token_ids (torch.Tensor): Selected token ids tensor. Shape: (batch_size, seq_len) + attention_mask (torch.Tensor): Attention mask tensor. Shape: (batch_size, seq_len) + advantages (torch.Tensor): Advantages tensor. Shape: (batch_size,) + bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,) + ref_per_token_logps: Reference model log probs per token tensor. Shape:(batch_size, seq_len) + ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size * seq_len, hidden_size) + ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size) + ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,) + beta (float): Weight for the KL penalty + loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo", "sapo", "luspo"). + Defaults to "dapo". 
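+                For "cispo", epsilon_high is typically larger (e.g. 5.0) and epsilon_low is unused.
+                For "sapo", a soft sigmoid gate replaces hard clipping.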
+ max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None. + importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token". + sapo_temperature_pos (float): Temperature for positive advantages in SAPO. Defaults to 1.0. + sapo_temperature_neg (float): Temperature for negative advantages in SAPO. Defaults to 1.05. + temperature (float): Temperature for the logits + compiled (bool): Whether to use torch compile + use_ref_model (bool): Whether to use a reference model + chunk_size (int): Size of chunks for processing. + vllm_is_ratio (torch.Tensor, optional): vLLM importance sampling ratio (batch_size, seq_len) or (batch_size, 1) or None. + Used to correct for distribution mismatch when using vLLM for generation. + Returns: + torch.Tensor: Computed loss + """ + # Validate before entering torch.compile boundary + if importance_sampling_level == "sequence" and loss_type in ("cispo", "sapo"): + raise ValueError( + f"Sequence-level importance sampling is not supported for loss_type='{loss_type}'. " + f"Use importance_sampling_level='token' instead." + ) + + return super().forward( + cls=cls, + ctx=ctx, + _input=_input, + weight=weight, + selected_token_ids=selected_token_ids, + attention_mask=attention_mask, + advantages=advantages, + bias=bias, + ref_per_token_logps=ref_per_token_logps, + old_per_token_logps=old_per_token_logps, + ref_input=ref_input, + ref_weight=ref_weight, + ref_bias=ref_bias, + beta=beta, + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + loss_type=loss_type, + max_completion_length=max_completion_length, + temperature=temperature, + compiled=compiled, + use_ref_model=use_ref_model, + chunk_size=chunk_size, + importance_sampling_level=importance_sampling_level, + sapo_temperature_pos=sapo_temperature_pos, + sapo_temperature_neg=sapo_temperature_neg, + vllm_is_ratio=vllm_is_ratio, + delta=delta, + use_bias_correction_kl=use_bias_correction_kl, + ) + + @staticmethod + def backward(ctx, grad_output, *grad_metrics): + """Backward pass for GRPO loss. 
+ + Args: + grad_output: Gradient of the loss (scalar) + grad_metrics: Gradients of the metrics (not used in backward computation) + """ + grads = LigerFusedLinearPPOBase.backward(ctx, grad_output) + return ( + *grads[ + :6 + ], # grad_input, grad_weight, grad_selected_token_ids, grad_attention_mask, grad_advantages, grad_bias + None, # grad_ref_per_token_logps + None, # grad_old_per_token_logps + None, # grad_ref_input + None, # grad_ref_weight + None, # grad_ref_bias + None, # grad_beta + None, # grad_epsilon_low + None, # grad_epsilon_high + None, # grad_loss_type (string, not differentiable) + None, # grad_max_completion_length (int, not differentiable) + None, # grad_importance_sampling_level (string, not differentiable) + None, # grad_sapo_temperature_pos (float, not differentiable) + None, # grad_sapo_temperature_neg (float, not differentiable) + None, # grad_temperature + None, # grad_compiled + None, # grad_use_ref_model + None, # grad_chunk_size + None, # grad_vllm_is_ratio + None, # grad_delta + None, # grad_use_bias_correction_kl + ) + + +class LigerFusedLinearGRPOLoss(torch.nn.Module): + """Fused linear layer with GRPO loss.""" + + def __init__( + self, + beta: float = 0.04, + compiled: bool = True, + use_ref_model: bool = True, + chunk_size: int = 1, + epsilon_low: float = 0.2, + epsilon_high: float = 0.2, + loss_type: str = "dapo", + max_completion_length: Optional[int] = None, + importance_sampling_level: str = "token", + sapo_temperature_pos: float = 1.0, + sapo_temperature_neg: float = 1.05, + temperature: float = 1.0, + delta: Optional[float] = None, + use_bias_correction_kl: bool = False, + ): + """ + Args: + beta (float): Weight for the KL penalty. + compiled (bool): Whether to use torch compile. + use_ref_model (bool): Whether to use a reference model. + chunk_size (int): Size of chunks for processing. + epsilon_low (float): Lower bound for the importance sampling ratio. + epsilon_high (float): Upper bound for the importance sampling ratio. + loss_type (str): Type of loss calculation ("grpo", "bnpo", "dr_grpo", "dapo", "cispo", "sapo", "luspo"). + Defaults to "dapo". For "cispo", epsilon_high is typically larger (e.g. 5.0) and + epsilon_low is unused. For "sapo", uses soft gating instead of hard clipping. + max_completion_length (int, optional): Maximum completion length, required for "dr_grpo". Defaults to None. + importance_sampling_level (str): Level of importance sampling ("token" or "sequence"). Defaults to "token". + sapo_temperature_pos (float): Temperature for positive advantages in SAPO. Defaults to 1.0. + sapo_temperature_neg (float): Temperature for negative advantages in SAPO. Defaults to 1.05. + temperature (float): Temperature for the logits. + delta (float, optional): Upper clamp for two-sided clipping (INTELLECT-2). None means disabled. + use_bias_correction_kl (bool): If True, multiply KL by importance sampling ratio (DeepSeek-V3.2). 
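+
+        Example:
+            A minimal usage sketch; the tensor names and shapes below are
+            illustrative placeholders rather than part of the API:
+
+            >>> loss_fn = LigerFusedLinearGRPOLoss(beta=0.04, loss_type="dapo")
+            >>> # hidden: (B*T, H) last hidden states; lm_head_weight: (V, H)
+            >>> outputs = loss_fn(hidden, lm_head_weight, token_ids, mask, advantages)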
+ """ + super().__init__() + # Validate SAPO temperatures to prevent division by zero or numerical instability + if sapo_temperature_pos <= 0: + raise ValueError(f"sapo_temperature_pos must be positive, got {sapo_temperature_pos}") + if sapo_temperature_neg <= 0: + raise ValueError(f"sapo_temperature_neg must be positive, got {sapo_temperature_neg}") + if delta is not None and delta <= 0: + raise ValueError(f"delta must be positive, got {delta}") + self.beta = beta + self.compiled = compiled + self.use_ref_model = use_ref_model + self.chunk_size = chunk_size + self.epsilon_low = epsilon_low + self.epsilon_high = epsilon_high + self.loss_type = loss_type + self.max_completion_length = max_completion_length + self.importance_sampling_level = importance_sampling_level + self.sapo_temperature_pos = sapo_temperature_pos + self.sapo_temperature_neg = sapo_temperature_neg + self.temperature = temperature + self.delta = delta + self.use_bias_correction_kl = use_bias_correction_kl + + def forward( + self, + _input, + lin_weight, + selected_token_ids, + attention_mask, + advantages, + bias=None, + ref_per_token_logps=None, + old_per_token_logps=None, + ref_input=None, + ref_weight=None, + ref_bias=None, + vllm_is_ratio=None, + ): + return LigerFusedLinearGRPOFunction.apply( + _input, + lin_weight, + selected_token_ids, + attention_mask, + advantages, + bias, + ref_per_token_logps, + old_per_token_logps, + ref_input, + ref_weight, + ref_bias, + self.beta, + self.epsilon_low, + self.epsilon_high, + self.loss_type, + self.max_completion_length, + self.importance_sampling_level, + self.sapo_temperature_pos, + self.sapo_temperature_neg, + self.temperature, + self.compiled, + self.use_ref_model, + self.chunk_size, + vllm_is_ratio, + self.delta, + self.use_bias_correction_kl, + ) diff --git a/src/liger_kernel/chunked_loss/jsd_loss.py b/src/liger_kernel/chunked_loss/jsd_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..64cc75a40dc7272d18271a465b5286087514c117 --- /dev/null +++ b/src/liger_kernel/chunked_loss/jsd_loss.py @@ -0,0 +1,215 @@ +import math + +from typing import Tuple +from typing import Union + +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase + + +class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase): + @staticmethod + def distillation_loss_fn(student_logits, teacher_logits, beta=0.5, target=None, ignore_index=-100): + """ + Compute JSD loss (Jensen-Shannon Divergence Loss). + Args: + student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,). + teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,). + beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`. + target (torch.Tensor): Target labels for masking. Shape: (chunk_size,). + ignore_index (int): Index to ignore in loss computation. 
+ Returns: + torch.Tensor: Jensen-Shannon Divergence loss + Note: + - Uses reduction="none" to preserve per-token losses for masking + - KL divergence requires summing over vocab dimension (not mean) + - Masking excludes padding/prompt tokens from loss computation + """ + student_log_probs = F.log_softmax(student_logits, dim=-1) + teacher_log_probs = F.log_softmax(teacher_logits, dim=-1) + + if beta == 0: + jsd_loss = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True) + elif beta == 1: + jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True) + else: + # Compute probabilities (only required for mean calculation) + log_mean_probs = torch.logsumexp( + torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0 + ) + + student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="none", log_target=True) + teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="none", log_target=True) + + # JSD is the weighted average of the KL divergences + jsd_loss = beta * teacher_kl + (1 - beta) * student_kl + + # Sum over vocab dimension (KL divergence definition) + jsd_loss = jsd_loss.sum(dim=-1) # (chunk_size,) + + # Apply ignore_index mask + if target is not None: + mask = target != ignore_index + jsd_loss = jsd_loss.masked_fill(~mask, 0.0) + + return jsd_loss.sum() + + @classmethod + def forward( + cls, + ctx, + student_input: torch.Tensor, + student_weight: torch.Tensor, + teacher_input: torch.Tensor, + teacher_weight: torch.Tensor, + true_labels: torch.LongTensor, + student_bias: torch.Tensor, + teacher_bias: torch.Tensor, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + compiled: bool = True, + chunk_size: int = 1024, + return_soft_hard_loss: bool = False, + ): + """ + Fused linear layer with JSD distillation loss. + Args: + student_input (torch.Tensor): Student input tensor. Shape: (batch_size * seq_len, hidden_size_student) + student_weight (torch.Tensor): Student weight tensor. Shape: (vocab_size, hidden_size_student) + teacher_input (torch.Tensor): Teacher input tensor. Shape: (batch_size * seq_len, hidden_size_teacher) + teacher_weight (torch.Tensor): Teacher weight tensor. Shape: (vocab_size, hidden_size_teacher) + true_labels (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,) + weight_hard_loss (float): Weight for hard loss. + weight_soft_loss (float): Weight for soft loss. + beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`. + ignore_index (int): Index to ignore in loss computation + temperature (float): Temperature for softening/sharpening distributions + compiled (bool): Whether to use torch compile + chunk_size (int): Size of chunks for processing. + return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False. 
+ Returns: + torch.Tensor: Computed loss, or tuple (loss, soft_loss, hard_loss) if return_soft_hard_loss=True + """ + return super().forward( + cls=cls, + ctx=ctx, + student_input=student_input, + student_weight=student_weight, + teacher_input=teacher_input, + teacher_weight=teacher_weight, + target=true_labels, + student_bias=student_bias, + teacher_bias=teacher_bias, + chunk_size=chunk_size, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + beta=beta, + ignore_index=ignore_index, + temperature=temperature, + compiled=compiled, + return_soft_hard_loss=return_soft_hard_loss, + ) + + @staticmethod + def backward(ctx, grad_output, *args): + grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6] + + return ( + *grads, + None, # teacher_bias + None, # weight_hard_loss + None, # weight_soft_loss + None, # beta + None, # ignore_index + None, # temperature + None, # compiled + None, # chunk_size + None, # return_soft_hard_loss + ) + + +class LigerFusedLinearJSDLoss(torch.nn.Module): + """ + Fused linear layer with JSD distillation loss. + """ + + def __init__( + self, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + compiled: bool = True, + chunk_size: int = 1024, + return_soft_hard_loss: bool = False, + ): + """ + Args: + weight_hard_loss (float): Weight for hard loss. + weight_soft_loss (float): Weight for soft loss. + ignore_index (int): Index to ignore in the loss + temperature (float): Temperature for softening distributions + compiled (bool): Whether to use torch compile + beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`. + chunk_size (int): Size of chunks for processing. + return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False. + """ + super().__init__() + assert temperature != 0, "Temperature cannot be 0." + self.weight_hard_loss = weight_hard_loss + self.weight_soft_loss = weight_soft_loss + self.ignore_index = ignore_index + self.temperature = temperature + self.compiled = compiled + self.beta = beta + self.chunk_size = chunk_size + self.return_soft_hard_loss = return_soft_hard_loss + + def forward( + self, + student_input: torch.Tensor, + student_weight: torch.Tensor, + teacher_input: torch.Tensor, + teacher_weight: torch.Tensor, + true_labels: torch.LongTensor, + student_bias: torch.Tensor = None, + teacher_bias: torch.Tensor = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """ + Compute the JSD distillation loss. 
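+        The soft (generalized JSD) term and the hard (true-label) term are
+        combined according to the weight_soft_loss and weight_hard_loss set at
+        construction time.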
+ + Args: + student_input (torch.Tensor): Student input tensor + student_weight (torch.Tensor): Student weight tensor + teacher_input (torch.Tensor): Teacher input tensor + teacher_weight (torch.Tensor): Teacher weight tensor + true_labels (torch.LongTensor): Target labels tensor + + Returns: + torch.Tensor or Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + If return_soft_hard_loss is False: Computed combined loss + If return_soft_hard_loss is True: Tuple of (combined_loss, soft_loss, hard_loss) + """ + return LigerFusedLinearJSDFunction.apply( + student_input, + student_weight, + teacher_input, + teacher_weight, + true_labels, + student_bias, + teacher_bias, + self.weight_hard_loss, + self.weight_soft_loss, + self.beta, + self.ignore_index, + self.temperature, + self.compiled, + self.chunk_size, + self.return_soft_hard_loss, + ) diff --git a/src/liger_kernel/chunked_loss/kto_loss.py b/src/liger_kernel/chunked_loss/kto_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..e7b4d503368815de8f59ce7353b0adef9c901705 --- /dev/null +++ b/src/liger_kernel/chunked_loss/kto_loss.py @@ -0,0 +1,210 @@ +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.fused_linear_unpaired_preference import LigerFusedLinearUnpairedPreferenceBase + + +class LigerFusedLinearKTOFunction(LigerFusedLinearUnpairedPreferenceBase): + @staticmethod + def preference_loss_fn( + log_prob_chunk, + preference_labels_chunk, + full_target, + ref_log_prob_chunk=None, + beta=0.1, + kl=None, + ): + """ + Implements the Kahneman-Tversky Optimization (KTO) loss function. + Paper: "KTO: Model Alignment as Prospect Theory-Guided Optimization" + https://arxiv.org/abs/2402.01306 + + KTO loss is inspired by prospect theory (https://en.wikipedia.org/wiki/Prospect_theory) + from behavioral economics, which models how humans make decisions under uncertainty. + The loss function is asymmetric, treating gains and losses differently, similar to + human decision-making patterns. + + Formula: + When y is chosen: + L_KTO = 1 - σ(β * (log[π(x)/π₀(x)] - KL(π||π₀)_y)) + When y is rejected: + L_KTO = 1 - σ(β * (KL(π||π₀)_y - log[π(x)/π₀(x)])) + + Where: + - σ: Sigmoid function + - β: Temperature parameter controlling the strength of the preference signal + - π(x): Policy (current model) + - π₀(x): Reference policy (reference model) + - KL(π||π₀)_y: KL divergence estimated using the rejected response y + + The loss encourages the model to: + 1. Assign higher probability to chosen responses + 2. Assign lower probability to rejected responses + 3. Maintain reasonable distance from the reference model + + Args: + log_prob_chunk: Log probabilities for the chunk (batch_size,) + preference_labels_chunk: Preference labels for the chunk (batch_size,) + full_target: Non chunked full target tensor + ref_log_prob_chunk: Reference log probs for the chunk (batch_size,) + beta: Weight for the KTO loss + kl: KL divergence between the policy model and the reference model for the chosen responses. 
Shape: (batch_size,)
+        Returns:
+            tuple: (loss, chosen_rewards_sum, rejected_rewards_sum), where loss is the
+            KTO loss value and the reward sums are recorded as metrics.
+        """
+        if ref_log_prob_chunk is not None:
+            logratios_chunk = log_prob_chunk - ref_log_prob_chunk
+        else:
+            logratios_chunk = log_prob_chunk
+        multiplier_chunk = torch.where(preference_labels_chunk, 1, -1)
+        if kl is not None:
+            losses = 1 - F.sigmoid(beta * (logratios_chunk - kl) * multiplier_chunk)
+        else:
+            losses = 1 - F.sigmoid(beta * logratios_chunk * multiplier_chunk)
+
+        rewards = beta * logratios_chunk
+        chosen_rewards_sum = (rewards * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_rewards_sum = (rewards * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        return losses.sum() / (full_target.shape[0]), chosen_rewards_sum, rejected_rewards_sum
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        _input,
+        weight,
+        target,
+        preference_labels,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        kl=None,
+        ignore_index=-100,
+        beta=0.1,
+        compiled=True,
+        use_ref_model=True,
+        average_log_prob=False,
+        chunk_size=1,
+    ):
+        """
+        Fused linear layer with KTO loss.
+        Args:
+            _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size)
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size)
+            target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,)
+            preference_labels (torch.Tensor): Preference labels tensor. Shape: (batch_size,)
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,)
+            ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size * seq_len, hidden_size)
+            ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
+            ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
+            kl (torch.Tensor, optional): KL divergence tensor. Shape: (batch_size,)
+            ignore_index (int): Index to ignore in loss computation
+            beta (float): Temperature parameter for the KTO loss
+            compiled (bool): Whether to use torch compile
+            use_ref_model (bool): Whether to use a reference model
+            average_log_prob (bool): Whether to average the log probability per non-masked token
+            chunk_size (int): Size of chunks for processing
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            target=target,
+            preference_labels=preference_labels,
+            bias=bias,
+            ignore_index=ignore_index,
+            beta=beta,
+            compiled=compiled,
+            use_ref_model=use_ref_model,
+            ref_input=ref_input,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            average_log_prob=average_log_prob,
+            kl=kl,
+            chunk_size=chunk_size,
+        )
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grads = LigerFusedLinearUnpairedPreferenceBase.backward(ctx, grad_output)[:5]
+        # One gradient per forward input: 5 computed, 10 None placeholders for the
+        # non-differentiable arguments (ref tensors, kl, and hyperparameters)
+        return (
+            *grads,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class LigerFusedLinearKTOLoss(torch.nn.Module):
+    """
+    Fused linear layer with Kahneman-Tversky Optimization (KTO) loss.
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        compiled: bool = True,
+        use_ref_model: bool = False,
+        average_log_prob: bool = False,
+        chunk_size: int = 1,
+    ):
+        """
+        Args:
+            ignore_index (int): Index to ignore in the loss calculation
+            beta (float): Temperature parameter for the KTO loss
+            compiled (bool): Whether to use compiled operations
+            use_ref_model (bool): Whether to use a reference model for the KTO loss.
+ average_log_prob (bool): Whether to average the log probability per non-masked token + chunk_size (int): Size of chunks for processing + """ + super().__init__() + self.ignore_index = ignore_index + self.beta = beta + self.compiled = compiled + self.use_ref_model = use_ref_model + self.average_log_prob = average_log_prob + self.chunk_size = chunk_size + + def forward( + self, + _input, + lin_weight, + target, + bias=None, + preference_labels=None, + ref_input=None, + ref_weight=None, + ref_bias=None, + kl=None, + ): + return LigerFusedLinearKTOFunction.apply( + _input, + lin_weight, + target, + preference_labels, + bias, + ref_input, + ref_weight, + ref_bias, + kl, + self.ignore_index, + self.beta, + self.compiled, + self.use_ref_model, + self.average_log_prob, + self.chunk_size, + ) diff --git a/src/liger_kernel/chunked_loss/orpo_loss.py b/src/liger_kernel/chunked_loss/orpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..7cb0bc3716444b5f77d8ab80cbee445495bd6f59 --- /dev/null +++ b/src/liger_kernel/chunked_loss/orpo_loss.py @@ -0,0 +1,144 @@ +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase + + +class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase): + @staticmethod + def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1): + """ + Paper: https://arxiv.org/pdf/2403.07691 + + Formula: + Compute odds-ratio loss: L_OR = -log(σ(log(odds_θ(y_w|x) / odds_θ(y_l|x)))) + where odds_θ(y|x) = P_θ(y|x) / (1 - P_θ(y|x)) + + Where: + - P_θ(y|x): Policy (model) probability + - y_w: Chosen sequence + - y_l: Rejected sequence + - σ: Sigmoid function + - β: Weight for the odds ratio loss + - odds_θ: Odds function for the policy + + Args: + chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,). + rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,). + full_target (torch.Tensor): Non chunked full target tensor + beta (float): Weight for the odds ratio loss. + """ + log_odds = (chosen_logps - rejected_logps) - ( + torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps)) + ) + ratio = F.logsigmoid(log_odds) + loss = -beta * ratio.sum() / (full_target.shape[0] // 2) + + chosen_rewards = beta * chosen_logps + rejected_rewards = beta * rejected_logps + + log_odds_ratio = torch.sum(ratio) / (full_target.shape[0] // 2) + log_odds_chosen = torch.sum(log_odds) / (full_target.shape[0] // 2) + + return loss, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen + + @classmethod + def forward( + cls, + ctx, + _input, + weight, + target, + bias=None, + ignore_index=-100, + beta=0.1, + compute_nll_loss=True, + nll_target=None, + compiled=True, + chunk_size=1, + ): + """ + Fused linear layer with ORPO loss. + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size) + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size) + target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,) + bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,) + ignore_index (int): Index to ignore in loss computation + beta (float): Weight for the odds ratio loss + compute_nll_loss (bool): Whether to compute the NLL loss + nll_target (torch.LongTensor, optional): Target tensor for NLL loss. 
Shape: (batch_size * seq_len,)
+            compiled (bool): Whether to use torch compile
+            chunk_size (int): Size of chunks for processing
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            target=target,
+            bias=bias,
+            ignore_index=ignore_index,
+            beta=beta,
+            compute_nll_loss=compute_nll_loss,
+            nll_target=nll_target,
+            compiled=compiled,
+            chunk_size=chunk_size,
+        )
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        return *grads, None, None, None, None, None, None
+
+
+class LigerFusedLinearORPOLoss(torch.nn.Module):
+    """
+    Fused linear layer with ORPO (Odds-Ratio Preference Optimization) loss.
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        compute_nll_loss: bool = True,
+        compiled: bool = True,
+        chunk_size: int = 1,
+    ):
+        """
+        Args:
+            ignore_index (int): Index to ignore in the loss.
+            beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute the NLL loss.
+            compiled (bool): Whether to use the torch compiled kernel.
+            chunk_size (int): Size of chunks for processing.
+        """
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.beta = beta
+        self.compute_nll_loss = compute_nll_loss
+        self.compiled = compiled
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        lin_weight,
+        _input,
+        target,
+        bias=None,
+        nll_target=None,
+    ):
+        return LigerFusedLinearORPOFunction.apply(
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ignore_index,
+            self.beta,
+            self.compute_nll_loss,
+            nll_target,
+            self.compiled,
+            self.chunk_size,
+        )
diff --git a/src/liger_kernel/chunked_loss/simpo_loss.py b/src/liger_kernel/chunked_loss/simpo_loss.py
new file mode 100755
index 0000000000000000000000000000000000000000..e24bb29b6ea82dbdb895020e5c953391cf9f86cf
--- /dev/null
+++ b/src/liger_kernel/chunked_loss/simpo_loss.py
@@ -0,0 +1,165 @@
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_preference import LigerFusedLinearPreferenceBase
+
+
+class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
+    @staticmethod
+    def preference_loss_fn(
+        chosen_logps,
+        rejected_logps,
+        full_target,
+        beta=0.1,
+        gamma=0.5,
+        label_smoothing=0.0,
+    ):
+        """
+        Paper: https://arxiv.org/pdf/2405.14734
+
+        Formula:
+        L_SimPO(π_θ) = -E [log σ(β/|y_w| log π_θ(y_w|x) - β/|y_l| log π_θ(y_l|x) - γ)]
+
+        Where:
+        - π_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - |y_w|, |y_l|: Sequence lengths
+        - σ: Sigmoid function
+        - β: beta weight
+        - γ: gamma, the target reward margin
+
+        Args:
+            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
+            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            full_target: Non-chunked full target tensor
+            beta (float): beta weight
+            gamma (float): gamma, the target reward margin
+            label_smoothing (float): Label smoothing factor; the loss reduces to the equation above as label_smoothing -> 0.
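+
+            With label smoothing eps, the implementation below computes the chunk loss as
+                -(1 - eps) * logsigmoid(logits) - eps * logsigmoid(-logits),
+            where logits = beta * (chosen_logps - rejected_logps) - gamma.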
+ """ + logits = beta * (chosen_logps - rejected_logps) - gamma + loss = (-F.logsigmoid(logits) * (1 - label_smoothing) - F.logsigmoid(-logits) * label_smoothing).sum() / ( + full_target.shape[0] // 2 + ) + + chosen_rewards = beta * chosen_logps + rejected_rewards = beta * rejected_logps + + return loss, chosen_rewards, rejected_rewards + + @classmethod + def forward( + cls, + ctx, + _input, + weight, + target, + bias=None, + ignore_index=-100, + beta=0.1, + alpha=1.0, + label_smoothing=0.0, + compute_nll_loss=False, + compiled=True, + gamma=0.5, + chunk_size=1, + ): + """ + Fused linear layer with SimPO loss. + Args: + _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size) + weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size) + target (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,) + bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,) + ignore_index (int): Index to ignore in loss computation + beta (float): Weight for the odds ratio loss + alpha (float): Weight for the alpha parameter + label_smoothing (float): Label smoothing factor + compute_nll_loss (bool): Whether to compute the NLL loss + compiled (bool): Whether to use torch compile + gamma (float): Weight for the gamma parameter + chunk_size (int): Size of chunks for processing + Returns: + torch.Tensor: Computed loss + """ + return super().forward( + cls=cls, + ctx=ctx, + _input=_input, + weight=weight, + target=target, + bias=bias, + ignore_index=ignore_index, + alpha=alpha, + beta=beta, + label_smoothing=label_smoothing, + compute_nll_loss=compute_nll_loss, + compiled=compiled, + gamma=gamma, + chunk_size=chunk_size, + ) + + @staticmethod + def backward(ctx, *grad_output): + grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4] + return *grads, None, None, None, None, None, None, None, None + + +class LigerFusedLinearSimPOLoss(torch.nn.Module): + """ + Fused linear layer with SimPO loss. + """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + alpha: float = 1.0, + label_smoothing: float = 0.0, + compute_nll_loss: bool = True, + compiled: bool = True, + gamma: float = 0.5, + chunk_size: int = 1, + ): + """ + Args: + ignore_index (int): Index to ignore in the loss. + beta (float): Weight for the odds ratio loss. + alpha (float): Weight for the alpha parameter. + label_smoothing (float): Label smoothing factor. + compute_nll_loss (bool): Whether to compute the NLL loss. + compiled (bool): Whether to use the torch compiled kernel. + gamma (float): Weight for the gamma parameter. + chunk_size (int): Size of chunks for processing. 
+ """ + super().__init__() + self.ignore_index = ignore_index + self.beta = beta + self.alpha = alpha + self.label_smoothing = label_smoothing + self.compute_nll_loss = compute_nll_loss + self.compiled = compiled + self.gamma = gamma + self.chunk_size = chunk_size + + def forward( + self, + lin_weight, + _input, + target, + bias=None, + ): + return LigerFusedLinearSimPOFunction.apply( + _input, + lin_weight, + target, + bias, + self.ignore_index, + self.beta, + self.alpha, + self.label_smoothing, + self.compute_nll_loss, + self.compiled, + self.gamma, + self.chunk_size, + ) diff --git a/src/liger_kernel/env_report.py b/src/liger_kernel/env_report.py new file mode 100755 index 0000000000000000000000000000000000000000..ff31855090ffffa54dd42a6d70ab333739cc2c55 --- /dev/null +++ b/src/liger_kernel/env_report.py @@ -0,0 +1,63 @@ +import platform +import sys + +from importlib.metadata import version + + +def print_env_report(): + """ + + Prints a report of the environment. Useful for debugging and reproducibility. + Usage: + ``` + python -m liger_kernel.env_report + ``` + + """ + print("Environment Report:") + print("-------------------") + print(f"Operating System: {platform.platform()}") + print(f"Python version: {sys.version.split()[0]}") + + try: + print(f"Liger Kernel version: {version('liger-kernel')}") + except ImportError: + print("Liger Kernel: Not installed") + + try: + import torch + + print(f"PyTorch version: {torch.__version__}") + cuda_version = torch.version.cuda if torch.cuda.is_available() else "Not available" + print(f"CUDA version: {cuda_version}") + hip_version = torch.version.hip if torch.cuda.is_available() and torch.version.hip else "Not available" + print(f"HIP(ROCm) version: {hip_version}") + + except ImportError: + print("PyTorch: Not installed") + print("CUDA version: Unable to query") + print("HIP(ROCm) version: Unable to query") + + try: + import triton + + print(f"Triton version: {triton.__version__}") + except ImportError: + print("Triton: Not installed") + + try: + import transformers + + print(f"Transformers version: {transformers.__version__}") + except ImportError: + print("Transformers: Not installed") + + try: + xpu_version = torch.version.xpu if torch.xpu.is_available() else "XPU Not Available" + print(f"XPU version: {xpu_version}") + except ImportError: + print("XPU version: Unable to query") + + +if __name__ == "__main__": + print_env_report() diff --git a/src/liger_kernel/ops/__init__.py b/src/liger_kernel/ops/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..cc7d0b038ba8e049f29678580c9afc342ba89411 --- /dev/null +++ b/src/liger_kernel/ops/__init__.py @@ -0,0 +1,144 @@ +""" +Liger-Kernel operators with automatic vendor-specific replacement. + +This module provides two ways to import operators: + +1. Import from this package (recommended for Function classes): + from liger_kernel.ops import LigerGELUMulFunction + + This automatically uses vendor-specific implementation if available. + +2. Import from submodules (for kernel functions or specific access): + from liger_kernel.ops.geglu import geglu_forward, geglu_backward + + This always uses the default implementation (no auto-replacement). + +The replacement mechanism: +1. Default implementations are imported from individual modules (e.g., geglu.py) +2. On module load, device is detected via infer_device() +3. If running on a supported vendor device (npu, xpu, etc.), the default + implementations are replaced with vendor-specific ones +4. 
All subsequent imports from this package get the replaced versions + +Note: Direct imports from submodules (e.g., from liger_kernel.ops.geglu import ...) + are NOT affected by the replacement mechanism. +""" + +# ============================================================================= +# Import default implementations +# Both Function classes and kernel functions are imported here. +# All of these can be replaced by vendor-specific implementations. +# ============================================================================= + +from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction # noqa: F401 +from liger_kernel.ops.cross_entropy import cross_entropy_backward # noqa: F401 +from liger_kernel.ops.cross_entropy import cross_entropy_forward # noqa: F401 +from liger_kernel.ops.dyt import LigerDyTFunction # noqa: F401 +from liger_kernel.ops.experimental.embedding import LigerEmbeddingFunction # noqa: F401 +from liger_kernel.ops.fused_add_rms_norm import LigerFusedAddRMSNormFunction # noqa: F401 +from liger_kernel.ops.fused_add_rms_norm import fused_add_rms_norm_backward # noqa: F401 +from liger_kernel.ops.fused_add_rms_norm import fused_add_rms_norm_forward # noqa: F401 +from liger_kernel.ops.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyFunction # noqa: F401 +from liger_kernel.ops.fused_linear_cross_entropy import fused_linear_cross_entropy_backward # noqa: F401 +from liger_kernel.ops.fused_linear_cross_entropy import fused_linear_cross_entropy_forward # noqa: F401 +from liger_kernel.ops.fused_linear_jsd import LigerFusedLinearJSDFunction # noqa: F401 +from liger_kernel.ops.fused_linear_jsd import fused_linear_jsd_backward # noqa: F401 +from liger_kernel.ops.fused_linear_jsd import fused_linear_jsd_forward # noqa: F401 +from liger_kernel.ops.fused_neighborhood_attention import LigerFusedNeighborhoodAttentionFunction # noqa: F401 +from liger_kernel.ops.geglu import LigerGELUMulFunction # noqa: F401 +from liger_kernel.ops.geglu import geglu_backward # noqa: F401 +from liger_kernel.ops.geglu import geglu_forward # noqa: F401 +from liger_kernel.ops.group_norm import LigerGroupNormFunction # noqa: F401 +from liger_kernel.ops.group_norm import group_norm_backward # noqa: F401 +from liger_kernel.ops.group_norm import group_norm_forward # noqa: F401 +from liger_kernel.ops.grpo_loss import GrpoLossFunction # noqa: F401 +from liger_kernel.ops.jsd import LigerJSDFunction # noqa: F401 +from liger_kernel.ops.jsd import jsd_backward # noqa: F401 +from liger_kernel.ops.jsd import jsd_forward # noqa: F401 +from liger_kernel.ops.kl_div import LigerKLDivLossFunction # noqa: F401 +from liger_kernel.ops.layer_norm import LigerLayerNormFunction # noqa: F401 +from liger_kernel.ops.layer_norm import layer_norm_backward # noqa: F401 +from liger_kernel.ops.layer_norm import layer_norm_forward # noqa: F401 +from liger_kernel.ops.llama4_rope import LigerLlama4RopeFunction # noqa: F401 +from liger_kernel.ops.mhc import LigerMHCCoeffsFunction # noqa: F401 +from liger_kernel.ops.mhc import LigerMHCPostResFunction # noqa: F401 +from liger_kernel.ops.mhc import LigerMHCPreFunction # noqa: F401 +from liger_kernel.ops.multi_token_attention import LigerMultiTokenAttentionFunction # noqa: F401 +from liger_kernel.ops.poly_norm import LigerPolyNormFunction # noqa: F401 +from liger_kernel.ops.poly_norm import poly_norm_backward # noqa: F401 +from liger_kernel.ops.poly_norm import poly_norm_forward # noqa: F401 +from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction # noqa: F401 
+from liger_kernel.ops.rms_norm import LigerRMSNormFunction  # noqa: F401
+from liger_kernel.ops.rms_norm import rms_norm_backward  # noqa: F401
+from liger_kernel.ops.rms_norm import rms_norm_forward  # noqa: F401
+from liger_kernel.ops.rope import LigerRopeFunction  # noqa: F401
+from liger_kernel.ops.rope import rope_backward  # noqa: F401
+from liger_kernel.ops.rope import rope_forward  # noqa: F401
+from liger_kernel.ops.softmax import LigerSoftmaxFunction  # noqa: F401
+from liger_kernel.ops.sparsemax import LigerSparsemaxFunction  # noqa: F401
+from liger_kernel.ops.swiglu import LigerSiLUMulFunction  # noqa: F401
+from liger_kernel.ops.swiglu import swiglu_backward  # noqa: F401
+from liger_kernel.ops.swiglu import swiglu_forward  # noqa: F401
+from liger_kernel.ops.tiled_mlp import LigerTiledMLPFunction  # noqa: F401
+from liger_kernel.ops.tiled_mlp import apply_tiled_mlp  # noqa: F401
+from liger_kernel.ops.tvd import LigerTVDLossFunction  # noqa: F401
+
+# NOTE: __all__ is intentionally NOT defined.
+# - Import from this package (liger_kernel.ops) -> subject to vendor replacement
+# - Import from submodules (liger_kernel.ops.geglu) -> always use default implementation
+
+
+# =============================================================================
+# Vendor-specific replacement logic
+# =============================================================================
+
+
+def _replace_with_vendor_ops():
+    """
+    Replace/add vendor-specific operator implementations.
+
+    This function is called automatically on module load. It:
+    1. Detects the current device (cuda, npu, xpu, etc.)
+    2. Looks up the vendor for that device via VENDOR_REGISTRY
+    3. Loads and applies vendor-specific implementations
+
+    Vendor implementations should be placed in:
+        liger_kernel/ops/backends/_<vendor>/ops/
+
+    If the vendor module defines __all__, only those symbols are exported.
+    Otherwise, all public symbols (not starting with _) are auto-discovered.
+
+    Note: Vendors can both override existing ops AND add new vendor-specific ops.
+    """
+    from liger_kernel.ops.backends import get_vendor_for_device
+    from liger_kernel.utils import infer_device
+
+    device = infer_device()
+
+    # Look up vendor info for this device
+    vendor_info = get_vendor_for_device(device)
+    if vendor_info is None:
+        return
+
+    try:
+        import importlib
+
+        vendor_ops = importlib.import_module(vendor_info.module_path)
+
+        # Get names to export: use __all__ if defined, otherwise auto-discover
+        names_to_export = getattr(vendor_ops, "__all__", None)
+
+        if names_to_export is None:
+            # Auto-discover: find all public symbols (classes and functions)
+            names_to_export = [name for name in dir(vendor_ops) if not name.startswith("_")]
+
+        # Replace or add to this module's globals
+        for name in names_to_export:
+            globals()[name] = getattr(vendor_ops, name)
+
+    except ImportError:
+        # Vendor module not available, use default implementations
+        pass
+
+
+_replace_with_vendor_ops()
diff --git a/src/liger_kernel/ops/backends/README.md b/src/liger_kernel/ops/backends/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..d4067157b60be225fc361acca9dea025d63d757c
--- /dev/null
+++ b/src/liger_kernel/ops/backends/README.md
@@ -0,0 +1,151 @@
+# Adding a New Vendor Backend
+
+This directory contains vendor-specific operator implementations that automatically replace the default (CUDA) implementations when running on the corresponding device.
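+
+For example (a sketch of the resulting behavior; the alias name below is arbitrary):
+
+```python
+# Resolves to the vendor override when one is registered for the detected device:
+from liger_kernel.ops import LigerGELUMulFunction
+
+# Always the default implementation, regardless of device:
+from liger_kernel.ops.geglu import LigerGELUMulFunction as DefaultGELUMul
+```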
+
+## Concepts
+
+- **Vendor**: Chip manufacturer (e.g., `ascend`, `intel`, `nvidia`)
+- **Device**: Device type (e.g., `npu`, `xpu`, `cuda`)
+- **VendorInfo**: Defines the mapping between vendor and device
+
+## Directory Structure
+
+```
+backends/
+├── README.md
+├── __init__.py
+├── registry.py          # VendorInfo, register_vendor(), VENDOR_REGISTRY
+├── _ascend/             # Ascend (Huawei) vendor - supports NPU
+│   ├── __init__.py      # Registers VendorInfo for NPU
+│   └── ops/
+│       ├── __init__.py  # Exports vendor-specific implementations
+│       └── geglu.py     # NPU-specific GEGLU implementation
+└── _<vendor>/           # Your new vendor backend
+    └── ...
+```
+
+## How It Works
+
+1. When `liger_kernel.ops.backends` is imported, it imports all vendor packages (e.g., `_ascend`)
+2. Each vendor's `__init__.py` calls `register_vendor()` to register itself
+3. When `liger_kernel.ops` is imported, `_replace_with_vendor_ops()` is called
+4. It detects the current device via `infer_device()` and looks up the vendor
+5. Vendor implementations replace/add to the `liger_kernel.ops` namespace
+
+## Adding a New Vendor
+
+### Step 1: Create Directory Structure
+
+```bash
+mkdir -p backends/_<vendor>/ops
+touch backends/_<vendor>/__init__.py
+touch backends/_<vendor>/ops/__init__.py
+```
+
+### Step 2: Register Your Vendor
+
+In `backends/_<vendor>/__init__.py`, register your vendor:
+
+```python
+"""
+<Vendor> backend for Liger-Kernel.
+"""
+
+from liger_kernel.ops.backends.registry import VendorInfo, register_vendor
+
+register_vendor(
+    VendorInfo(
+        vendor="<vendor>",
+        device="<device>",
+    )
+)
+```
+
+### Step 3: Ensure Device Detection Works
+
+Make sure `infer_device()` in `liger_kernel/utils.py` can detect your device:
+
+```python
+def infer_device():
+    if torch.cuda.is_available():
+        return "cuda"
+    if is_npu_available():
+        return "npu"
+    # Add your device detection here
+    if is_<device>_available():
+        return "<device>"
+    return "cpu"
+```
+
+### Step 4: Implement Vendor-Specific Operators
+
+Create operator files in `backends/_<vendor>/ops/`. For example, `geglu.py`:
+
+```python
+import torch
+
+class LigerGELUMulFunction(torch.autograd.Function):
+    """
+    Vendor-specific LigerGELUMulFunction implementation.
+    """
+    @staticmethod
+    def forward(ctx, a, b):
+        # Your vendor-specific forward implementation
+        ...
+
+    @staticmethod
+    def backward(ctx, dc):
+        # Your vendor-specific backward implementation
+        ...
+
+# Optional: vendor-specific kernel functions
+def geglu_forward_vendor(a, b):
+    ...
+
+def geglu_backward_vendor(a, b, dc):
+    ...
+```
+
+### Step 5: Export in `ops/__init__.py`
+
+In `backends/_<vendor>/ops/__init__.py`, export your implementations:
+
+```python
+"""
+<Vendor>-specific operator implementations.
+"""
+
+from .geglu import (
+    LigerGELUMulFunction,
+    geglu_forward_vendor as geglu_forward,  # Rename to match default API
+    geglu_backward_vendor as geglu_backward,
+)
+
+# Explicitly declare what to export (recommended)
+__all__ = [
+    "LigerGELUMulFunction",
+    "geglu_forward",
+    "geglu_backward",
+]
+```
+
+## Key Points
+
+### Incremental Override
+
+You **don't need to implement all operators**. Only implement the ones that require vendor-specific adaptations. Unimplemented operators automatically fall back to the default (CUDA) implementation.
+
+### Vendor-Specific Additions
+
+Vendors can also **add new operators** that don't exist in the default implementation. These are exported to the `liger_kernel.ops` namespace for users to import.
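+
+For example, a hypothetical vendor-only op registered this way becomes importable as:
+
+```python
+# Hypothetical name for illustration; not an op that ships with Liger-Kernel
+from liger_kernel.ops import my_vendor_fused_op
+```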
+ +### Naming Convention + +- Use the **same class/function names** as the default implementations for overrides +- This allows seamless replacement without changing user code +- Use `as` imports to rename if your internal naming differs + +## Example: Ascend NPU Backend + +See `_ascend/` directory for a complete example of the Ascend NPU backend implementation. diff --git a/src/liger_kernel/ops/backends/__init__.py b/src/liger_kernel/ops/backends/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..ad7779c48afdc2efb3783ad80a87224c407e1c28 --- /dev/null +++ b/src/liger_kernel/ops/backends/__init__.py @@ -0,0 +1,13 @@ +import importlib +import pkgutil + +from liger_kernel.ops.backends.registry import VENDOR_REGISTRY # noqa: F401 +from liger_kernel.ops.backends.registry import VendorInfo # noqa: F401 +from liger_kernel.ops.backends.registry import get_vendor_for_device # noqa: F401 +from liger_kernel.ops.backends.registry import register_vendor # noqa: F401 + +# Auto-import all _ subpackages to trigger registration +# Each vendor's __init__.py calls register_vendor() when imported +for _, modname, ispkg in pkgutil.iter_modules(__path__): + if ispkg and modname.startswith("_"): + importlib.import_module(f"{__name__}.{modname}") diff --git a/src/liger_kernel/ops/backends/_ascend/__init__.py b/src/liger_kernel/ops/backends/_ascend/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..a07e7ab09e89f8781949bb4d6d6b1fb1e27e3344 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/__init__.py @@ -0,0 +1,5 @@ +from liger_kernel.ops.backends.registry import VendorInfo +from liger_kernel.ops.backends.registry import register_vendor + +# Register Ascend vendor for NPU device +register_vendor(VendorInfo(vendor="ascend", device="npu")) diff --git a/src/liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md b/src/liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md new file mode 100755 index 0000000000000000000000000000000000000000..bf9faa3dc09ce5b3c57aa1a7827f4faa0e1fab61 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md @@ -0,0 +1,492 @@ +# Ascend NPU UB Manager Design Document + +## Overview + +The UB Manager (Unified Buffer Manager) is a core component in **Liger-Kernel** responsible for managing the Unified Buffer (UB) capacity on Ascend NPUs. By automatically detecting UB capacity and providing unified tiling strategy computation, it helps Triton kernels avoid UB overflow errors while maintaining high performance. + +## Design Goals + +1. **Automated UB Management**: Automatically detect device UB capacity without manual configuration +2. **Unified Strategy System**: Use a single unified strategy function for all kernels, abstracting memory calculations +3. **Flexible Parameters**: Support different memory multipliers and safety margins for different kernels +4. 
**Easy to Use**: Simple interface that directly computes tiling results + +## Architecture Design + +### Core Components + +``` +┌─────────────────────────────────────────────────────────┐ +│ UB Manager System │ +├─────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────────┐ │ +│ │ UBManager │ │ Default Strategy │ │ +│ │ (Singleton)│────────▶│ Function │ │ +│ └──────────────┘ └──────────────────┘ │ +│ │ │ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────────┐ │ +│ │ Capacity │ │ compute_default │ │ +│ │ Detection │ │ _tiling_strategy│ │ +│ └──────────────┘ └──────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────┘ + │ │ + │ │ + ▼ ▼ +┌──────────────┐ ┌──────────────────┐ +│ GEGLU │ │ ROPE │ +│ Kernel │ │ Kernel │ +└──────────────┘ └──────────────────┘ +``` + +### Class Diagram + +``` +┌──────────────────────────────────────┐ +│ UBManager │ +├──────────────────────────────────────┤ +│ - _npu_model: str │ +│ - _ub_capacity_bits: int │ +├──────────────────────────────────────┤ +│ + ub_capacity_bits: int │ +│ + ub_capacity_bytes: int │ +│ + npu_model: str │ +│ - _detect_npu_model() │ +│ - _detect_ub_capacity() │ +│ (raises RuntimeError if fails) │ +└──────────────────────────────────────┘ + +┌──────────────────────────────────────┐ +│ compute_default_tiling_strategy │ +├──────────────────────────────────────┤ +│ + safety_margin: float │ +│ + dtype_size: int │ +│ + memory_multiplier: float │ +│ + shapes: Tuple[Tuple[int, ...], ...]│ +│ + tiling_dims: Tuple │ +├──────────────────────────────────────┤ +│ Returns: Tuple[Tuple[int, ...], ...] │ +│ (same structure as shapes) │ +└──────────────────────────────────────┘ + +┌──────────────────────────────────────┐ +│ _normalize_tiling_dims │ +├──────────────────────────────────────┤ +│ Helper function to normalize │ +│ tiling_dim (int or tuple) to set │ +└──────────────────────────────────────┘ +``` + +## Core Functionality + +### 1. UB Capacity Detection + +The UB Manager detects UB capacity in the following priority order: + +1. **Environment Variable**: `ASCEND_UB_CAPACITY_BITS` (in bits) + - If set, this value is used directly + - Must be a positive integer representing UB capacity in bits + +2. **get_soc_spec**: Query UB size from CANN's `get_soc_spec("UB_SIZE")` + - Returns UB size in bytes + - Automatically converted to bits (bytes * 8) + - Requires CANN environment to be sourced (e.g., `source /usr/local/Ascend/ascend-toolkit/set_env.sh`) + +3. **Error Handling**: If neither method succeeds, raises `RuntimeError` with clear instructions + + +```python +# Detection flow: +# 1. Check ASCEND_UB_CAPACITY_BITS env var (bits) +# 2. Try get_soc_spec("UB_SIZE") (bytes) -> convert to bits +# 3. Raise RuntimeError if both fail +``` + +### 2. Unified Strategy System + +All kernels use a single unified strategy function `_default_strategy` that abstracts memory calculations: + +``` +Memory Formula: memory_multiplier * BLOCK_SIZE * unit_param * dtype_size * 8 bits +``` + +Where `unit_param` is automatically calculated as the product of all fixed (non-tiling) dimensions in each shape. 
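+
+As a worked example (a sketch assuming a 192KB UB, the Atlas 800I A2 figure used in the Ascend kernels, plus the default 0.80 safety margin and the ROPE-style parameters described below):
+
+```python
+# Solve the memory formula for the largest safe BLOCK_SIZE.
+ub_capacity_bits = 192 * 1024 * 8          # 1,572,864 bits (192KB UB)
+safe_bits = ub_capacity_bits * 0.80        # 1,258,291.2 bits after the safety margin
+bits_per_row = 3.0 * 128 * 4 * 8           # memory_multiplier * unit_param (pad_hd=128) * dtype_size (fp32) * 8
+max_block_size = safe_bits / bits_per_row  # ~102.4
+# Largest power of 2 <= 102.4 is 64, so BLOCK_SIZE = 64.
+```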
+ +The strategy function: +- Takes UB capacity, safety margin, dtype size, memory multiplier, shapes, and tiling dimension specifications +- For each shape, identifies which dimensions can be tiled (from `tiling_dims`) +- Calculates `unit_param` as the product of fixed (non-tiling) dimensions +- Calculates the maximum safe block size that fits within UB capacity +- Returns a tuple of max_safe_block_size values (one for each shape) + +The `compute_default_tiling_strategy` function: +- Calls `_default_strategy` to get max_safe_block_size for each shape +- For each tiling dimension, computes desired block size using `triton.next_power_of_2(original_dim)` +- Returns the final result with same structure as input shapes: tiling dimensions replaced with computed block sizes, non-tiling dimensions padded to next power of 2 + +### 3. Parameter Structure + +The unified strategy uses the following parameters: + +- **`safety_margin`**: Safety margin as a float (e.g., 0.80 for 80%). Default is 0.80. +- **`dtype_size`**: Size of data type in bytes (e.g., 2 for float16, 4 for float32) +- **`memory_multiplier`**: Memory multiplier for estimating peak memory usage + - For GEGLU: typically 10.0 for backward, 7.0 for forward + - For ROPE: typically 3.0 +- **`shapes`**: Tuple of full shapes. Each shape is a tuple of dimension sizes. + - For ROPE: `((n_q_head, hd), (n_kv_head, hd))` + - For GEGLU: `((n_cols,),)` + - Can pass original shapes (will handle padding internally) or padded shapes +- **`tiling_dims`**: Tuple specifying which dimensions can be tiled for each shape. + - Each element can be: + - `int`: single dimension index (e.g., `0` for first dimension) + - `tuple of ints`: multiple dimensions that can be tiled together (non-empty) + - For ROPE: `(0, 0)` means first dimension of each shape can be tiled + - For GEGLU: `(0,)` means first dimension of the shape can be tiled + - Length must match `len(shapes)` + - Fixed dimensions (non-tiling) are automatically extracted from shapes and multiplied to get `unit_param` + - **Validation**: Raises `ValueError` if: + - Any `tiling_dim` is empty or invalid (e.g., empty tuple) + - Any dimension index is out of bounds (negative or >= shape length) + +### 4. 
Strategy Computation Flow + +``` +User calls compute_default_tiling_strategy() + │ + ▼ +Get UB manager instance + │ + ▼ +Validate shapes and tiling_dims (lengths must match) + │ + ▼ +Set defaults for dtype_size (4) and memory_multiplier (10.0) + │ + ▼ +Call _default_strategy() with: + - ub_capacity_bits + - safety_margin + - dtype_size + - memory_multiplier + - shapes + - tiling_dims + │ + ▼ +For each (shape, tiling_dim) pair: + Normalize tiling_dim to set of dimension indices + Validate tiling dimensions are within shape bounds + (Raises ValueError if invalid) + │ + ▼ + Calculate unit_param: + unit_param = product of all non-tiling dimensions + │ + ▼ + Calculate max_block_size: + SAFE_UB_CAPACITY_BITS = ub_capacity_bits * safety_margin + max_block_size = SAFE_UB_CAPACITY_BITS / (memory_multiplier * unit_param * dtype_size * 8) + │ + ▼ + Find largest power of 2 <= max_block_size + │ + ▼ +Return tuple of max_safe_block_size (one per shape) + │ + ▼ +Build result with same structure as shapes: + For each (shape, tiling_dim, max_safe): + For each tiling dimension: + desired = triton.next_power_of_2(original_dim) + final = min(desired, max_safe) + final = max(1, final) + For each non-tiling dimension: + pad to triton.next_power_of_2(original_dim) + │ + ▼ +Return tuple of tiled shapes +``` + +## Usage Examples + +### Basic Usage + +```python +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy + +# GEGLU forward +shapes = ((4096,),) +tile_shapes = compute_default_tiling_strategy( + safety_margin=0.80, + dtype_size=2, # float16 + memory_multiplier=7.0, + shapes=shapes, + tiling_dims=(0,) # First dimension can be tiled +) +if tile_shapes is not None and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + # Call kernel with block_size + +# ROPE forward +shapes = ((32, 128), (32, 128)) # (n_q_head, hd), (n_kv_head, hd) +tile_shapes = compute_default_tiling_strategy( + safety_margin=0.90, + dtype_size=4, # float32 + memory_multiplier=3.0, + shapes=shapes, + tiling_dims=(0, 0) # First dimension of each shape can be tiled +) +if tile_shapes is not None and len(tile_shapes) == len(shapes): + q_tile_shape, k_tile_shape = tile_shapes + BLOCK_Q, _ = q_tile_shape # Tiled dimension + BLOCK_K, _ = k_tile_shape # Tiled dimension + # Call kernel with BLOCK_Q and BLOCK_K +``` + +## Strategy Function Details + +### `_normalize_tiling_dims` Helper Function + +A helper function that normalizes tiling dimension specifications: + +```python +def _normalize_tiling_dims(tiling_dim: Union[int, Tuple[int, ...]]) -> set: + """ + Normalize tiling dimension specification to a set of dimension indices. + + Args: + tiling_dim: Either an int (single dimension) or tuple of ints (multiple dimensions). + + Returns: + Set of dimension indices that can be tiled. + """ +``` + +This function handles the conversion of `tiling_dim` from either an `int` or `tuple` to a `set` for consistent processing. + +### `_default_strategy` Function + +The core strategy function that calculates maximum safe block size: + +```python +def _default_strategy( + ub_capacity_bits: int, + safety_margin: float, + dtype_size: int, + memory_multiplier: float, + shapes: Tuple[Tuple[int, ...], ...], + tiling_dims: Tuple[Union[int, Tuple[int, ...]], ...], +) -> Tuple[int, ...]: + """ + Calculate maximum safe block size based on UB capacity. + + Memory formula: memory_multiplier * BLOCK_SIZE * unit_param * dtype_size * 8 bits + + For each shape, fixed dimensions (non-tiling) are multiplied together to get unit_param. 
+ + Returns: + Tuple of max_safe_block_size (power of 2), one for each shape. + + Raises: + ValueError: If any tiling_dim is empty or invalid, or if any dimension + index is out of bounds for the corresponding shape. + """ +``` + +**Key Steps:** +1. For each `(shape, tiling_dim)` pair: + - Normalize `tiling_dim` to a set of dimension indices using `_normalize_tiling_dims` + - Validate tiling dimensions are within shape bounds + - Raises `ValueError` if `tiling_dim` is empty or invalid + - Raises `ValueError` if any dimension index is out of bounds + - Calculate `unit_param` as the product of all non-tiling dimensions + - If all dimensions are tiling, `unit_param = 1.0` +2. Calculate `SAFE_UB_CAPACITY_BITS = ub_capacity_bits * safety_margin` +3. Solve for max_block_size: `SAFE_UB_CAPACITY_BITS / (memory_multiplier * unit_param * dtype_size * 8)` +4. Find largest power of 2 <= max_block_size +5. Return tuple with one max_safe_block_size per shape + +### `compute_default_tiling_strategy` Function + +The public interface that computes final tiling results: + +```python +def compute_default_tiling_strategy( + safety_margin: float = 0.80, + dtype_size: Optional[int] = None, + memory_multiplier: Optional[float] = None, + shapes: Optional[Tuple[Tuple[int, ...], ...]] = None, + tiling_dims: Optional[Tuple[Union[int, Tuple[int, ...]], ...]] = None, +) -> Optional[Tuple[Tuple[int, ...], ...]]: + """ + Compute tiling strategy using the default strategy function. + + Returns tuple of tiled shapes with same structure as input shapes. + Tiling dimensions are replaced with computed block sizes (power of 2), + while non-tiling dimensions are padded to next power of 2. + + Returns: + Tuple of tiled shapes, or None if shapes/tiling_dims are empty or + lengths don't match. + + Raises: + ValueError: If any tiling_dim is empty or invalid, or if any dimension + index is out of bounds for the corresponding shape. + """ +``` + +**Key Steps:** +1. Get UB manager instance +2. Validate `shapes` and `tiling_dims` (lengths must match, cannot be empty) + - Returns `None` if validation fails (empty or mismatched lengths) +3. Set defaults for `dtype_size` (4) and `memory_multiplier` (10.0) if not provided +4. Call `_default_strategy` to get `max_supported` (tuple of max_safe_block_size, one per shape) + - May raise `ValueError` if `tiling_dims` are invalid (see `_default_strategy` documentation) +5. For each `(shape, tiling_dim, max_safe)`: + - Normalize `tiling_dim` to a set of dimension indices + - Validate tiling dimensions are within shape bounds + - Raises `ValueError` if `tiling_dim` is empty or invalid + - Raises `ValueError` if any dimension index is out of bounds + - For each tiling dimension: + - Compute `desired = triton.next_power_of_2(original_dim)` + - Compute `final = min(desired, max_safe)` + - Ensure `final >= 1` + - Replace dimension with `final` + - For each non-tiling dimension: + - Pad to `triton.next_power_of_2(original_dim)` +6. 
Return tuple of tiled shapes (same structure as input `shapes`) + +## Memory Analysis Examples + +### GEGLU Forward + +``` +Memory analysis: +- Inputs: a, b +- Intermediates: a_cubed, tanh_arg, tanh_result, geglu_a +- Output: c +- Total: ~7x * BLOCK_SIZE * dtype_size + +Strategy: +- shapes: ((n_cols,),) +- tiling_dims: (0,) # First dimension can be tiled +- Fixed dimensions: none (all dimensions are tiling) +- unit_param = 1 (product of fixed dimensions) +- memory_multiplier = 7.0 +- Formula: 7.0 * BLOCK_SIZE * 1 * dtype_size * 8 bits +- Returns: ((block_size,),) +``` + +### GEGLU Backward + +``` +Memory analysis: +- More intermediates for gradient computation +- Total: ~10x * BLOCK_SIZE * dtype_size + +Strategy: +- shapes: ((n_cols,),) +- tiling_dims: (0,) # First dimension can be tiled +- Fixed dimensions: none (all dimensions are tiling) +- unit_param = 1 (product of fixed dimensions) +- memory_multiplier = 10.0 +- Formula: 10.0 * BLOCK_SIZE * 1 * dtype_size * 8 bits +- Returns: ((block_size,),) +``` + +### ROPE Forward/Backward + +``` +Memory analysis (based on optimized ROPE kernel): +- cos_vals and sin_vals: pad_hd // 2 elements each (shared) +- In q heads loop (peak memory): + * q_left, q_right, new_left, new_right: 2 * BLOCK_Q * pad_hd elements +- In k heads loop (peak memory): + * k_left, k_right, new_left, new_right: 2 * BLOCK_K * pad_hd elements +- Plus shared cos/sin: pad_hd elements +- Conservative estimate: 3 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits + +Strategy: +- shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd)) +- tiling_dims: (0, 0) # First dimension of each shape can be tiled +- Fixed dimensions: pad_hd (second dimension, non-tiling) +- unit_param = pad_hd (product of fixed dimensions) +- memory_multiplier = 3.0 +- Formula: 3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits +- Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd)) +``` + +## Extension Guide + +### Adding a New Kernel + +To add tiling support for a new kernel: + +1. **Analyze memory usage**: + - Identify peak memory usage in the kernel + - Determine memory multiplier (e.g., 7.0, 10.0, 3.0) + - Identify which dimensions can be tiled and which are fixed + - Fixed dimensions will be automatically extracted and multiplied to get `unit_param` + +2. **Use `compute_default_tiling_strategy`** in your kernel: + +```python +def my_kernel_forward(input): + # Prepare parameters + n_cols = input.shape[-1] + dtype_size = input.element_size() + + # Compute strategy + # Example 1: Simple case (all dimensions can be tiled) + shapes = ((n_cols,),) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.80, + dtype_size=dtype_size, + memory_multiplier=7.0, # Based on your memory analysis + shapes=shapes, + tiling_dims=(0,) # First dimension can be tiled + ) + + if tile_shapes is not None and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + else: + block_size = triton.next_power_of_2(n_cols) # Fallback + + # Example 2: Multiple shapes with fixed dimensions + # shapes = ((M, K), (K, N)) + # tiling_dims = (0, 1) # First shape: dim 0 can be tiled, dim 1 is fixed + # # Second shape: dim 0 is fixed, dim 1 can be tiled + # Returns: ((block_M, K), (K, block_N)) + + # Call kernel + kernel[(grid_size,)]( + input, + BLOCK_SIZE=block_size, + ) +``` + +3. 
**Document memory analysis** in comments: + +```python +# My kernel tiling strategy: +# - Memory analysis: +# * Input: input +# * Intermediates: intermediate1, intermediate2 +# * Output: output +# * Total: ~7x * BLOCK_SIZE * dtype_size +# - shapes: ((n_cols,),) +# - tiling_dims: (0,) means first dimension can be tiled +# - Fixed dimensions: none (all dimensions are tiling) +# - unit_param = 1 (product of fixed dimensions) +# - Uses memory_multiplier=7.0 * BLOCK_SIZE * dtype_size * 8 bits for safety +# - compute_default_tiling_strategy returns: ((block_size,),) +# where block_size = min(triton.next_power_of_2(n_cols), max_safe_block_size) +``` + +## Future Improvements + +1. **Strategy Variants**: If needed, could add specialized strategy functions for specific kernels while keeping the unified interface +2. **Multi-dimensional Tiling**: Could extend to support more complex tiling patterns if needed diff --git a/src/liger_kernel/ops/backends/_ascend/ops/__init__.py b/src/liger_kernel/ops/backends/_ascend/ops/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..38fcc7ee1c9677373665cfff5071d13cc3dd0dcc --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/__init__.py @@ -0,0 +1,139 @@ +""" +Ascend NPU operator implementations. + +This module exports Ascend NPU-optimized implementations that will automatically +replace the default implementations when running on NPU devices. + +Both Function classes and kernel functions can be exported here. + +To add a new operator: +1. Create the implementation file (e.g., rms_norm.py) +2. Import the Function class and/or kernel functions here +3. Optionally add to __all__ for explicit control + +If __all__ is not defined, all public symbols will be auto-discovered. +""" + +from liger_kernel.ops.backends._ascend.ops.cross_entropy import LigerCrossEntropyFunction +from liger_kernel.ops.backends._ascend.ops.cross_entropy import cross_entropy_backward +from liger_kernel.ops.backends._ascend.ops.cross_entropy import cross_entropy_forward +from liger_kernel.ops.backends._ascend.ops.dyt import LigerDyTFunction +from liger_kernel.ops.backends._ascend.ops.dyt import liger_dyt_bwd +from liger_kernel.ops.backends._ascend.ops.dyt import liger_dyt_fwd +from liger_kernel.ops.backends._ascend.ops.embedding import LigerEmbeddingFunction +from liger_kernel.ops.backends._ascend.ops.embedding import embedding_backward +from liger_kernel.ops.backends._ascend.ops.embedding import embedding_forward +from liger_kernel.ops.backends._ascend.ops.fused_add_rms_norm import LigerFusedAddRMSNormFunction +from liger_kernel.ops.backends._ascend.ops.fused_add_rms_norm import fused_add_rms_norm_backward +from liger_kernel.ops.backends._ascend.ops.fused_add_rms_norm import fused_add_rms_norm_forward +from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import LigerFusedLinearJSDFunction +from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_backward +from liger_kernel.ops.backends._ascend.ops.fused_linear_jsd import fused_linear_jsd_forward +from liger_kernel.ops.backends._ascend.ops.geglu import LigerGELUMulFunction +from liger_kernel.ops.backends._ascend.ops.geglu import geglu_backward +from liger_kernel.ops.backends._ascend.ops.geglu import geglu_forward +from liger_kernel.ops.backends._ascend.ops.group_norm import LigerGroupNormFunction +from liger_kernel.ops.backends._ascend.ops.group_norm import group_norm_backward +from liger_kernel.ops.backends._ascend.ops.group_norm import group_norm_forward +from 
liger_kernel.ops.backends._ascend.ops.grpo_loss import GrpoLossFunction +from liger_kernel.ops.backends._ascend.ops.grpo_loss import grpo_loss_backward_triton +from liger_kernel.ops.backends._ascend.ops.grpo_loss import grpo_loss_forward_triton +from liger_kernel.ops.backends._ascend.ops.jsd import LigerJSDFunction +from liger_kernel.ops.backends._ascend.ops.jsd import jsd_backward +from liger_kernel.ops.backends._ascend.ops.jsd import jsd_forward +from liger_kernel.ops.backends._ascend.ops.kl_div import LigerKLDivLossFunction +from liger_kernel.ops.backends._ascend.ops.kl_div import kldiv_backward_triton +from liger_kernel.ops.backends._ascend.ops.kl_div import kldiv_forward_triton +from liger_kernel.ops.backends._ascend.ops.layer_norm import LigerLayerNormFunction +from liger_kernel.ops.backends._ascend.ops.layer_norm import layer_norm_backward +from liger_kernel.ops.backends._ascend.ops.layer_norm import layer_norm_forward +from liger_kernel.ops.backends._ascend.ops.llama4_rope import LigerLlama4RopeFunction +from liger_kernel.ops.backends._ascend.ops.llama4_rope import llama4_rope_backward +from liger_kernel.ops.backends._ascend.ops.llama4_rope import llama4_rope_forward +from liger_kernel.ops.backends._ascend.ops.poly_norm import LigerPolyNormFunction +from liger_kernel.ops.backends._ascend.ops.poly_norm import poly_norm_backward +from liger_kernel.ops.backends._ascend.ops.poly_norm import poly_norm_forward +from liger_kernel.ops.backends._ascend.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction +from liger_kernel.ops.backends._ascend.ops.qwen2vl_mrope import qwen2vl_mrope_backward +from liger_kernel.ops.backends._ascend.ops.qwen2vl_mrope import qwen2vl_mrope_forward +from liger_kernel.ops.backends._ascend.ops.rms_norm import LigerRMSNormFunction +from liger_kernel.ops.backends._ascend.ops.rms_norm import rms_norm_backward +from liger_kernel.ops.backends._ascend.ops.rms_norm import rms_norm_forward +from liger_kernel.ops.backends._ascend.ops.rope import LigerRopeFunction +from liger_kernel.ops.backends._ascend.ops.rope import rope_backward +from liger_kernel.ops.backends._ascend.ops.rope import rope_forward +from liger_kernel.ops.backends._ascend.ops.softmax import LigerSoftmaxFunction +from liger_kernel.ops.backends._ascend.ops.softmax import _softmax_backward +from liger_kernel.ops.backends._ascend.ops.softmax import _softmax_forward +from liger_kernel.ops.backends._ascend.ops.sparsemax import LigerSparsemaxFunction +from liger_kernel.ops.backends._ascend.ops.sparsemax import sparsemax_backward +from liger_kernel.ops.backends._ascend.ops.sparsemax import sparsemax_forward +from liger_kernel.ops.backends._ascend.ops.swiglu import LigerSiLUMulFunction +from liger_kernel.ops.backends._ascend.ops.swiglu import swiglu_backward +from liger_kernel.ops.backends._ascend.ops.swiglu import swiglu_forward +from liger_kernel.ops.backends._ascend.ops.tvd import LigerTVDLossFunction +from liger_kernel.ops.backends._ascend.ops.tvd import tv_distance_forward_triton +from liger_kernel.ops.backends._ascend.ops.tvd import tvd_backward_triton + +__all__ = [ + "LigerEmbeddingFunction", + "embedding_forward", + "embedding_backward", + "LigerFusedAddRMSNormFunction", + "fused_add_rms_norm_forward", + "fused_add_rms_norm_backward", + "LigerGELUMulFunction", + "geglu_forward", + "geglu_backward", + "LigerQwen2VLMRopeFunction", + "qwen2vl_mrope_forward", + "qwen2vl_mrope_backward", + "LigerRMSNormFunction", + "rms_norm_forward", + "rms_norm_backward", + "LigerRopeFunction", + "rope_forward", + 
"rope_backward", + "LigerSiLUMulFunction", + "swiglu_forward", + "swiglu_backward", + "LigerTVDLossFunction", + "tv_distance_forward_triton", + "tvd_backward_triton", + "LigerLlama4RopeFunction", + "llama4_rope_forward", + "llama4_rope_backward", + "LigerPolyNormFunction", + "poly_norm_forward", + "poly_norm_backward", + "LigerDyTFunction", + "liger_dyt_fwd", + "liger_dyt_bwd", + "LigerKLDivLossFunction", + "kldiv_forward_triton", + "kldiv_backward_triton", + "LigerLayerNormFunction", + "layer_norm_backward", + "layer_norm_forward", + "LigerSoftmaxFunction", + "_softmax_forward", + "_softmax_backward", + "LigerJSDFunction", + "jsd_forward", + "jsd_backward", + "LigerCrossEntropyFunction", + "cross_entropy_backward", + "cross_entropy_forward", + "GrpoLossFunction", + "grpo_loss_forward_triton", + "grpo_loss_backward_triton", + "LigerFusedLinearJSDFunction", + "fused_linear_jsd_forward", + "fused_linear_jsd_backward", + "LigerGroupNormFunction", + "group_norm_forward", + "group_norm_backward", + "LigerSparsemaxFunction", + "sparsemax_forward", + "sparsemax_backward", +] diff --git a/src/liger_kernel/ops/backends/_ascend/ops/cross_entropy.py b/src/liger_kernel/ops/backends/_ascend/ops/cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..7bc512a0339b6be4129a582512c910d5a792a0d8 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/cross_entropy.py @@ -0,0 +1,568 @@ +from typing import Optional + +import torch +import triton +import triton.language as tl + +from triton.language.math import tanh + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import element_mul_kernel +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def liger_cross_entropy_kernel( + X_ptr, + X_stride, + Y_ptr, + Y_stride, + weight_ptr, + loss_ptr, + z_loss_ptr, + loss_stride, + token_accuracy_ptr, + token_accuracy_stride, + predicted_tokens_ptr, + predicted_tokens_stride, + n_cols, + n_rows, + n_non_ignore, + sum_non_ignore_weight, + weight_sum, + ignore_index, + lse_square_scale: tl.constexpr, + label_smoothing: tl.constexpr, + reduction: tl.constexpr, # set it as constexpr since reduction is always known at compile time + softcap, + RETURN_Z_LOSS: tl.constexpr, + RETURN_TOKEN_ACCURACY: tl.constexpr, + RETURN_PREDICTED_TOKENS: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + HAS_WEIGHT: tl.constexpr, + HAS_SOFTCAPPING: tl.constexpr, + HAS_GRADIENTS: tl.constexpr, +): + """ + This kernel computes both cross entropy loss and the gradient of the input. + We only consider hard label + mean reduction for now. Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math. + + Parameters: + X_ptr: Pointer to input tensor. + X_stride (int): The stride of the input tensor. + Y_ptr: Pointer to target tensor. + Y_stride (int): The stride of the target tensor. + weight_ptr: Pointer to weight tensor. + loss_ptr: Pointer to tensor to store the loss. + z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0. + loss_stride (int): The stride of the loss tensor. + token_accuracy_ptr: Pointer to tensor to store the per-token accuracy. No operation if RETURN_TOKEN_ACCURACY is 0. + token_accuracy_stride (int): The stride of the token accuracy tensor. + n_cols (int): The number of columns in the input tensor. + n_rows (int): The total number of rows to process. + n_non_ignore (float): The number of non-ignored elements in the batch. 
+ sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch. + weight_sum (float): The sum of weight tensor. + ignore_index (int): The index to ignore in the target. + label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. + lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training. + reduction (str): The string for the reduction to apply + softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap). + RETURN_Z_LOSS (int): The boolean value to decide whether to store z loss to z_loss_ptr or not. It must be 0 or 1. + RETURN_TOKEN_ACCURACY (int): The boolean value to decide whether to store per-token accuracy to token_accuracy_ptr or not. It must be 0 or 1. + BLOCK_SIZE (int): The block size for Triton operations. + HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes. + HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not. + HAS_GRADIENTS (bool): The boolean value to determine whether calculating gradients in forward pass. + """ + + # Grid-Stride Loop: each program processes multiple rows + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + start_row = pid + stride = num_progs + + for row_idx in range(start_row, n_rows, stride): + # https://github.com/triton-lang/triton/issues/1058 + # If B*T*V is too large, program_id * stride will overflow out of int32, so we convert to int64 + program_id = row_idx.to(tl.int64) + + # 1. Load Y_ptr first because if the target is ignore_index, we can return right away + Y_ptr_offset = program_id * Y_stride + y = tl.load(Y_ptr + Y_ptr_offset) + + # 2. locate the start index + X_ptr_offset = program_id * X_stride + + is_ignored = y == ignore_index + + if is_ignored: + # set all X_ptr as 0 + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + tl.store(X_ptr + X_ptr_offset + X_offsets, 0.0, mask=X_offsets < n_cols) + # For ignored tokens, set token accuracy to 0 + if RETURN_TOKEN_ACCURACY: + token_accuracy_ptr_offset = program_id * token_accuracy_stride + tl.store(token_accuracy_ptr + token_accuracy_ptr_offset, 0.0) + if RETURN_PREDICTED_TOKENS: + predicted_tokens_ptr_offset = program_id * predicted_tokens_stride + tl.store(predicted_tokens_ptr + predicted_tokens_ptr_offset, -1) + else: + loss_ptr_offset = program_id * loss_stride + if RETURN_Z_LOSS: + z_loss_ptr_offset = program_id * loss_stride + if RETURN_TOKEN_ACCURACY: + token_accuracy_ptr_offset = program_id * token_accuracy_stride + if RETURN_PREDICTED_TOKENS: + predicted_tokens_ptr_offset = program_id * predicted_tokens_stride + + if HAS_WEIGHT: + weight_y = tl.load(weight_ptr + y).cast(tl.float32) + + # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax) + # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867 + + # 3. [Online softmax] first pass: find max + sum + m = float("-inf") # m is the max value. use the notation from the paper + d = 0.0 # d is the sum. 
use the notation from the paper + argmax_idx = 0 # Track the index of the maximum value for token accuracy / predicted tokens computation + ori_X_y = tl.load(X_ptr + X_ptr_offset + y).cast( + tl.float32 + ) # we need to store the original value of X_y for the loss calculation + if HAS_SOFTCAPPING: + ori_X_y = softcap * tanh(ori_X_y / softcap) + + # Label smoothing is a general case of normal cross entropy + # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310 + scaled_x_sum = 0.0 + eps = label_smoothing / n_cols + + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_ptr_offset + X_offsets, + mask=X_offsets < n_cols, + other=float("-inf"), + # Ensure float32 precision for softmax calculation + ).cast(tl.float32) + if HAS_SOFTCAPPING: + X_block = softcap * tanh(X_block / softcap) + block_max = tl.max(X_block) + + # Track argmax for accuracy / predicted tokens computation + if RETURN_TOKEN_ACCURACY or RETURN_PREDICTED_TOKENS: + # Find the index of the maximum value in this block + is_max_mask = X_block == block_max + # Mask out invalid indices with a value larger than n_cols + masked_offsets = tl.where(is_max_mask, X_offsets, n_cols) + # Get the first (smallest) index where max occurs + current_block_argmax_idx = tl.min(masked_offsets) + + is_new_max = block_max > m + argmax_idx = tl.where(is_new_max, current_block_argmax_idx, argmax_idx) + + if label_smoothing > 0: + # scale X beforehand to avoid overflow + if HAS_WEIGHT: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block * weight_block, 0.0)) + else: + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0)) + m_new = tl.maximum(m, block_max) + d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new)) + m = m_new + + # log (sum(e^(X_i))) = log (sum(e ^ (max(X) * e ^ (X_i - max(X))))) + # = log (e^(max(X)) * sum(e ^ (X_i - max(X)))) + # = max(X) + log (sum(e ^ (X_i - max(X)))) = m + log d + lse = m + tl.log(d) + + # 4. 
[Online Softmax] Second pass: compute gradients + # For 'mean' reduction, gradients are normalized by number of non-ignored elements (N) + # dx_y = (softmax(x_y) - 1) / N + # dx_i = softmax(x_i) / N, i != y + # For label smoothing: + # dx_i = (softmax(x_i) - label_smoothing / V) / N, V = n_cols, i != y + # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N + # = dx_i - (1 - label_smoothing) / N + # With Z loss: + # dx_i = ((1 + 2 * lse_square_scale * lse) * softmax(x_i) - label_smoothing / V) / N, i != y + # dx_y = dx_i - (1 - label_smoothing) / N + # For 'sum' reduction, no normalization is applied: + # dx_y = softmax(x_y) - 1 + # dx_i = softmax(x_i), for i ≠ y + if HAS_GRADIENTS: + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_ptr_offset + X_offsets, + mask=X_offsets < n_cols, + other=float("-inf"), + # Ensure float32 precision for softmax calculation + ).cast(tl.float32) + if HAS_SOFTCAPPING: + intermediate = tanh(X_block / softcap) + X_block = softcap * intermediate + + if not HAS_WEIGHT: + # softmax(x_i) + X_block = tl.exp(X_block - m) / d + # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i) + X_block += 2 * lse_square_scale * lse * X_block + # smoothing term + X_block += -eps + # special handle dx_y + X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing)) + # reduction scale + if reduction == "mean": + X_block = X_block / n_non_ignore + else: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + softmax_X = tl.exp(X_block - m) / d + # derivative of original_loss + dloss_ori = (1 - label_smoothing) * softmax_X + # specially handle dx_y + dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing)) + dloss_ori = dloss_ori * weight_y + # derivative of smooth_loss + dloss_smooth = eps * (-weight_block + softmax_X * weight_sum) + # derivative of z-loss + dz_loss = 2 * lse_square_scale * lse * softmax_X + # reduction scale + if reduction == "mean": + dloss_ori = dloss_ori / sum_non_ignore_weight + dloss_smooth = dloss_smooth / sum_non_ignore_weight + # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight. + dz_loss = dz_loss / n_non_ignore + # derivative of total_loss + X_block = dloss_ori + dloss_smooth + dz_loss + + # chain rule softcapping + # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap)) + if HAS_SOFTCAPPING: + X_block = X_block * (1 - intermediate * intermediate) + + tl.store(X_ptr + X_ptr_offset + X_offsets, X_block, mask=X_offsets < n_cols) + + # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in + # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34 + tl.debug_barrier() + + # 5. 
Calculate the loss
+
+            # loss = log (softmax(X_y)) = log ((e ^ (X_y - max(X)) / sum(e ^ (X - max(X))))
+            #      = (X_y - max(X)) - log(sum(e ^ (X - max(X))))
+            #      = X_y - m - log d = X_y - lse
+            # sum(e ^ (X - max(X))) must >= 1 because the max term is e ^ 0 = 1
+            # So we can safely calculate log (softmax(X_y)) without overflow
+            loss = lse - ori_X_y
+            if HAS_WEIGHT:
+                loss = weight_y * loss
+
+            # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
+            # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
+            #          = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
+            # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
+            #          = (1 - label_smoothing) * H(q, p) + (sum(-eps * x_i) + label_smoothing * (m + logd))
+            # Refer to H(q', p) in section 7 of the paper: https://arxiv.org/pdf/1512.00567
+            # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
+            # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
+            if label_smoothing > 0:
+                if HAS_WEIGHT:
+                    smooth_loss = scaled_x_sum + eps * lse * weight_sum
+                else:
+                    smooth_loss = scaled_x_sum + label_smoothing * lse
+                loss = loss * (1 - label_smoothing) + smooth_loss
+
+            # An auxiliary loss, z_loss
+            # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html
+            z_loss = lse_square_scale * lse * lse
+            # Normalize the loss by the number of non-ignored elements if reduction is "mean"
+            if reduction == "mean":
+                if HAS_WEIGHT:
+                    loss = loss / sum_non_ignore_weight
+                else:
+                    loss = loss / n_non_ignore
+                # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+                z_loss = z_loss / n_non_ignore
+            loss += z_loss
+
+            tl.store(loss_ptr + loss_ptr_offset, loss)
+            if RETURN_Z_LOSS:
+                tl.store(z_loss_ptr + z_loss_ptr_offset, z_loss)
+            if RETURN_TOKEN_ACCURACY:
+                # Store 1.0 if prediction is correct, 0.0 otherwise
+                is_correct = 1.0 if argmax_idx == y else 0.0
+                tl.store(token_accuracy_ptr + token_accuracy_ptr_offset, is_correct)
+            if RETURN_PREDICTED_TOKENS:
+                tl.store(predicted_tokens_ptr + predicted_tokens_ptr_offset, argmax_idx)
+
+
+def get_optimal_block_size(n_cols, has_gradients=True):
+    """
+    Calculate the optimal BLOCK_SIZE using compute_default_tiling_strategy.
+    """
+    # Cross entropy is more memory intensive than swiglu because it needs the softmax computation:
+    # the forward pass needs the online softmax calculation, and the backward pass needs more memory for intermediate variables.
+    # 12.0 and 8.0 are empirical values based on the Atlas 800I A2 UB (192KB).
+    multiplier = 12.0 if has_gradients else 8.0
+
+    # Call the calculation function.
+    # Treat the input as 1D (n_cols,), tiling only on dim 0.
+    tile_shapes = compute_default_tiling_strategy(
+        safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((n_cols,),), tiling_dims=(0,)
+    )
+
+    # Parse the result
+    if tile_shapes and len(tile_shapes) > 0:
+        block_size = tile_shapes[0][0]
+        return block_size
+    else:
+        return 2048
+
+
+def cross_entropy_forward(
+    _input,
+    target,
+    weight,
+    ignore_index,
+    lse_square_scale,
+    label_smoothing,
+    reduction,
+    softcap,
+    return_z_loss,
+    return_token_accuracy=False,
+    return_predicted_tokens=False,
+):
+    assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
+    assert isinstance(return_token_accuracy, bool), (
+        f"return_token_accuracy must be True or False. 
Got: {return_token_accuracy}" + ) + assert isinstance(return_predicted_tokens, bool), ( + f"return_predicted_tokens must be True or False. Got: {return_predicted_tokens}" + ) + + BT, V = _input.shape + n_rows = BT + + BLOCK_SIZE = get_optimal_block_size(V, has_gradients=_input.requires_grad) + + # unreduced loss + loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) + z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) if return_z_loss else None + token_accuracy_1d = ( + torch.zeros(n_rows, dtype=torch.float32, device=_input.device) if return_token_accuracy else None + ) + predicted_tokens_1d = ( + torch.full((n_rows,), -1, dtype=torch.int64, device=_input.device) if return_predicted_tokens else None + ) + + target_mask = target != ignore_index + n_non_ignore = target_mask.sum().item() + assert (target * target_mask).max() < _input.shape[-1], ( + f"Target {target.max()} is out of bounds. Expected < {_input.shape[-1]}" + ) + assert (target * target_mask).min() >= 0, f"Target {target.min()} is out of bounds. Expected >= 0" + sum_non_ignore_weight = n_non_ignore + weight_sum = 0.0 + if weight is not None: + assert weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {weight.shape}" + assert torch.is_floating_point(weight), ( + f"If given, weight has to be a Tensor of floating point dtype. Got: {weight.dtype}" + ) + sum_non_ignore_weight = torch.gather(weight, dim=0, index=target.masked_select(target_mask)).sum().item() + weight_sum = weight.sum().item() + # ensure weight is contiguous + if weight.stride(-1) != 1: + weight = weight.contiguous() + + # ensure _input and target are contiguous in the last dimension + if _input.stride(-1) != 1: + _input = _input.contiguous() + if target.stride(-1) != 1: + target = target.contiguous() + + # NPU-optimized grid configuration + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_rows) + + # Here we use a trick to store X_ptr gradient in X_ptr so we can save memory + liger_cross_entropy_kernel[(grid_size,)]( + X_ptr=_input, + X_stride=_input.stride(-2), + Y_ptr=target, + Y_stride=target.stride(-1), # always 1 + weight_ptr=weight, # dummy if None + loss_ptr=loss_1d, + z_loss_ptr=z_loss_1d, + loss_stride=loss_1d.stride(-1), # always 1 + token_accuracy_ptr=token_accuracy_1d, + token_accuracy_stride=token_accuracy_1d.stride(-1) + if return_token_accuracy + else 0, # always 1 if accuracy is enabled + predicted_tokens_ptr=predicted_tokens_1d, + predicted_tokens_stride=predicted_tokens_1d.stride(-1) + if return_predicted_tokens + else 0, # always 1 if predicted tokens is enabled + n_cols=V, + n_rows=n_rows, + n_non_ignore=n_non_ignore, + sum_non_ignore_weight=sum_non_ignore_weight, + ignore_index=ignore_index, + weight_sum=weight_sum, + lse_square_scale=lse_square_scale, + label_smoothing=label_smoothing, + reduction=reduction, + softcap=softcap, + RETURN_Z_LOSS=return_z_loss, + RETURN_TOKEN_ACCURACY=return_token_accuracy, + RETURN_PREDICTED_TOKENS=return_predicted_tokens, + BLOCK_SIZE=BLOCK_SIZE, + HAS_WEIGHT=True if weight is not None else False, + HAS_SOFTCAPPING=True if softcap is not None else False, + HAS_GRADIENTS=_input.requires_grad, + ) + + if reduction == "none": + loss = loss_1d + z_loss = z_loss_1d if return_z_loss else None + token_accuracy = token_accuracy_1d if return_token_accuracy else None + else: + loss = torch.sum(loss_1d) + z_loss = torch.sum(z_loss_1d) if return_z_loss else None + # For accuracy, we compute the mean across all non-ignored tokens + token_accuracy 
= torch.sum(token_accuracy_1d) / n_non_ignore if return_token_accuracy else None
+
+    predicted_tokens = predicted_tokens_1d if return_predicted_tokens else None
+
+    return loss, z_loss, token_accuracy, predicted_tokens, _input
+
+
+def cross_entropy_backward(_input, grad_output):
+    # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
+    if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+        pass
+    # If reduction is 'none'
+    elif grad_output.ndim > 0:
+        _input = _input * grad_output.unsqueeze(dim=1)
+    # If reduction is ['mean', 'sum'], grad_output is just a scalar
+    # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
+    # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
+    else:
+        BT, V = _input.shape
+        n_rows = BT
+        BLOCK_SIZE = min(2048, triton.next_power_of_2(V))
+
+        element_mul_kernel[(n_rows,)](
+            _input,
+            _input.stride(-2),
+            grad_output,
+            V,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+
+    return _input
+
+
+class LigerCrossEntropyFunction(torch.autograd.Function):
+    """
+    This class implements a custom autograd function for the Liger Cross Entropy loss.
+    It overrides the forward and backward methods of the torch.autograd.Function class.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        _input: torch.Tensor,
+        target: torch.Tensor,
+        weight: Optional[torch.FloatTensor],
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
+    ):
+        """
+        The forward pass of the Liger Cross Entropy loss.
+
+        Parameters:
+        ctx : The context object.
+        _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
+        target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
+        weight (Tensor, optional): A manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype.
+        ignore_index (int): The index to ignore in the target.
+        lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
+        label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
+        reduction (str): The reduction to apply to the output: "none" | "mean" | "sum".
+        softcap (Optional[float]): The upper threshold for scaling logits to the range (-softcap, +softcap).
+        return_z_loss (bool): When `return_z_loss` is `True`, returns (loss, z_loss, token_accuracy, predicted_tokens) instead of (loss, None, None, None). Default: `False`
+        return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
+        return_predicted_tokens (bool): When `return_predicted_tokens` is `True`, returns per-token predicted class indices (argmax) without materializing logits. Default: `False`
+
+        Returns:
+        tuple: A tuple with the computed losses, accuracy, and predicted tokens: (loss, z_loss, token_accuracy, predicted_tokens). z_loss, token_accuracy, and predicted_tokens are None if not requested. 
+ """ + input_requires_grad = _input.requires_grad + + loss, z_loss, token_accuracy, predicted_tokens, _input = cross_entropy_forward( + _input, + target, + weight, + ignore_index, + lse_square_scale, + label_smoothing, + reduction, + softcap, + return_z_loss, + return_token_accuracy, + return_predicted_tokens, + ) + if input_requires_grad: + ctx.save_for_backward(_input.detach()) + ctx.return_z_loss = return_z_loss + ctx.return_token_accuracy = return_token_accuracy + ctx.return_predicted_tokens = return_predicted_tokens + + return loss, z_loss, token_accuracy, predicted_tokens + + @staticmethod + def backward(ctx, grad_output, grad_output2, grad_output3, grad_output4): + """ + The backward pass of the Liger Cross Entropy loss. + + Parameters: + ctx : The context object with saved tensors. + grad_output (tensor): The tensor containing the gradient of the loss with respect to the output. + grad_output2 (tensor): No use. Gradient for z_loss (not used as z_loss is only for logging). + grad_output3 (tensor): No use. Gradient for token_accuracy (not used as token_accuracy is only for metrics). + grad_output4 (tensor): No use. Gradient for predicted_tokens (not used as predicted_tokens is only for metrics). + Returns: + tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None. + """ + if ctx.return_z_loss: + del grad_output2 # z_loss is only for logging + if ctx.return_token_accuracy: + del grad_output3 # token_accuracy is only for metrics + if ctx.return_predicted_tokens: + del grad_output4 # predicted_tokens is only for metrics + + (_input,) = ctx.saved_tensors + _input = cross_entropy_backward(_input, grad_output) + return ( + _input, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/backends/_ascend/ops/dyt.py b/src/liger_kernel/ops/backends/_ascend/ops/dyt.py new file mode 100755 index 0000000000000000000000000000000000000000..cdcb1a327fd940ec3c3c734a60816cf130b9d856 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/dyt.py @@ -0,0 +1,285 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import tanh + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +# ----------------------------------------------------------------------------- +# Forward Kernel +# ----------------------------------------------------------------------------- + + +@triton.jit +def _dyt_fwd_kernel( + X, + Y, + Alpha, + Gamma, + Beta, + HAVE_BETA: tl.constexpr, + M: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """ + Forward kernel for DYT: y = tanh(α·x) · γ + β + + Grid: (num_col_blocks, num_row_programs) + Each program processes multiple rows using grid-stride loop + """ + pid_n = tl.program_id(0) + pid_m = tl.program_id(1) + num_row_programs = tl.num_programs(1) + + col_start = pid_n * BLOCK_N + col_offsets = col_start + tl.arange(0, BLOCK_N) + col_mask = col_offsets < N + + alpha = tl.load(Alpha).to(tl.float32) + gamma = tl.load(Gamma + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + if HAVE_BETA: + beta = tl.load(Beta + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + # Grid-stride loop over rows + for row_idx in range(pid_m, M, num_row_programs): + row_offset = row_idx * N + + x = tl.load(X + row_offset + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + # Compute: y = 
tanh(α·x) · γ + β + tanh_x = tanh(alpha * x) + y = tanh_x * gamma + + if HAVE_BETA: + y += beta + + tl.store(Y + row_offset + col_offsets, y, mask=col_mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel +# ----------------------------------------------------------------------------- + + +@triton.jit +def _dyt_bwd_kernel( + DY, + DX, + DA, + DG, + DB, + X, + Alpha, + Gamma, + HAVE_BETA: tl.constexpr, + M: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr, +): + """ + Backward kernel for DYT + + Grid: (num_col_blocks, num_row_programs) + Each program processes multiple rows using grid-stride loop + Accumulates gradients in local buffers, then stores to global memory + """ + pid_n = tl.program_id(0) + pid_m = tl.program_id(1) + num_row_programs = tl.num_programs(1) + + col_start = pid_n * BLOCK_N + col_offsets = col_start + tl.arange(0, BLOCK_N) + col_mask = col_offsets < N + + alpha = tl.load(Alpha).to(tl.float32) + gamma = tl.load(Gamma + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + da_vec = tl.zeros((BLOCK_N,), dtype=tl.float32) + dg_acc = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAVE_BETA: + db_acc = tl.zeros((BLOCK_N,), dtype=tl.float32) + + # Grid-stride loop over rows + for row_idx in range(pid_m, M, num_row_programs): + row_offset = row_idx * N + + x = tl.load(X + row_offset + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + dy = tl.load(DY + row_offset + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + tanh_x = tanh(alpha * x) + + if HAVE_BETA: + db_acc += dy + + dg_acc += dy * tanh_x + + # Compute intermediate: tmp = (1 - tanh²) · dy · γ + tmp = (1.0 - tanh_x * tanh_x) * dy * gamma + + # Accumulate dα = Σ(x · tmp) + da_vec += x * tmp + + # Compute dx = α · tmp + dx = alpha * tmp + tl.store(DX + row_offset + col_offsets, dx, mask=col_mask) + + da_acc = tl.sum(da_vec, 0) + da_offset = pid_m * triton.cdiv(N, BLOCK_N) + pid_n + tl.store(DA + da_offset, da_acc) + + dg_offset = pid_m * N + col_offsets + tl.store(DG + dg_offset, dg_acc, mask=col_mask) + + if HAVE_BETA: + db_offset = pid_m * N + col_offsets + tl.store(DB + db_offset, db_acc, mask=col_mask) + + +def get_optimal_block_size(total_elements, is_backward=False): + """ + Calculate optimal Block Size using compute_default_tiling_strategy + """ + multiplier = 8.0 if is_backward else 4.0 + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((total_elements,),), tiling_dims=(0,) + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return block_size + else: + return 2048 + + +def _compute_grid_size(n_cols, n_rows, block_n): + """ + Compute grid size to avoid launching idle programs + + Args: + n_cols: Number of columns + n_rows: Number of rows + block_n: Block size for column dimension + + Returns: + (num_col_blocks, num_row_programs) + """ + num_cores = get_npu_core_count() + num_col_blocks = triton.cdiv(n_cols, block_n) + num_row_blocks = n_rows + + num_row_programs = min(max(1, (num_cores // num_col_blocks)), num_row_blocks) + + return num_col_blocks, num_row_programs + + +# ----------------------------------------------------------------------------- +# Python Wrapper Functions +# ----------------------------------------------------------------------------- + + +def liger_dyt_fwd(x, alpha, gamma, beta): + """ + Forward pass of DYT: y = tanh(α·x) · γ + β + + Args: + x: Input tensor of shape [..., N] + alpha: Scalar parameter + gamma: 
Vector parameter of shape [N] + beta: Vector parameter of shape [N] (optional) + + Returns: + y: Output tensor of same shape as x + """ + assert x.is_contiguous() + HAVE_BETA = beta is not None + + # Flatten to 2D + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + M, N = x.shape + + # Allocate output + y = torch.empty_like(x) + + block_n = get_optimal_block_size(N, is_backward=False) + + # Compute grid size + num_col_blocks, num_row_programs = _compute_grid_size(N, M, block_n) + grid = (num_col_blocks, num_row_programs) + + # Launch kernel + _dyt_fwd_kernel[grid](x, y, alpha, gamma, beta, HAVE_BETA, M, N, BLOCK_N=block_n) + + return y.view(input_shape) + + +def liger_dyt_bwd(dy, x, alpha, gamma, beta): + """ + Backward pass of DYT + + Args: + dy: Upstream gradient of shape [..., N] + x: Input tensor of shape [..., N] + alpha: Scalar parameter + gamma: Vector parameter of shape [N] + beta: Vector parameter of shape [N] (optional) + + Returns: + dx: Gradient w.r.t. x + dalpha: Gradient w.r.t. alpha + dgamma: Gradient w.r.t. gamma + dbeta: Gradient w.r.t. beta (or None) + """ + assert dy.is_contiguous() + HAVE_BETA = beta is not None + + # Flatten to 2D + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + dy = dy.view(-1, input_shape[-1]) + M, N = x.shape + + block_n = get_optimal_block_size(N, is_backward=True) + + # Compute grid size + num_col_blocks, num_row_programs = _compute_grid_size(N, M, block_n) + grid = (num_col_blocks, num_row_programs) + + da = torch.zeros(num_row_programs, triton.cdiv(N, block_n), dtype=torch.float32, device=x.device) + dg = torch.empty(num_row_programs, N, dtype=torch.float32, device=x.device) + db = torch.empty(num_row_programs, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None + dx = torch.empty_like(dy) + + _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, BLOCK_N=block_n) + + da = da.sum().to(x.dtype).unsqueeze(0) + dg = dg.sum(0).to(gamma.dtype) + db = db.sum(0).to(x.dtype) if HAVE_BETA else None + + return dx.view(input_shape), da, dg, db + + +# ----------------------------------------------------------------------------- +# Autograd Function +# ----------------------------------------------------------------------------- + + +class LigerDyTFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, x, alpha, gamma, beta): + y = liger_dyt_fwd(x, alpha, gamma, beta) + ctx.save_for_backward(x, alpha, gamma, beta) + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, dy): + x, alpha, gamma, beta = ctx.saved_tensors + dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta) + return dx, dalpha, dgamma, dbeta diff --git a/src/liger_kernel/ops/backends/_ascend/ops/embedding.py b/src/liger_kernel/ops/backends/_ascend/ops/embedding.py new file mode 100755 index 0000000000000000000000000000000000000000..dc6016d41eb1b45f5aa0dfb3b5098269e56713ee --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/embedding.py @@ -0,0 +1,210 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def embedding_forward_kernel( + embeddings_ptr, + indices_ptr, + output_ptr, + n_elements, + embedding_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + 
grid_m = tl.cdiv(n_elements, BLOCK_SIZE_M) + grid_n = tl.cdiv(embedding_dim, BLOCK_SIZE_N) + total_2d_blocks = grid_m * grid_n + + for block_idx in tl.range(pid, total_2d_blocks, num_progs): + block_m = block_idx // grid_n + block_n = block_idx % grid_n + + start_m = block_m * BLOCK_SIZE_M + start_n = block_n * BLOCK_SIZE_N + + offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M) + mask_m = offsets_m < n_elements + + indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0) + + offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N) + mask_n = offsets_n < embedding_dim + + block_mask = mask_m[:, None] & mask_n[None, :] + + embedding_offsets = indices[:, None] * embedding_dim + offsets_n[None, :] + embeddings = tl.load( + embeddings_ptr + embedding_offsets, + mask=block_mask, + other=0.0, + ) + + output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :] + tl.store( + output_ptr + output_offsets, + embeddings, + mask=block_mask, + ) + + +@triton.jit +def embedding_backward_kernel( + grad_output_ptr, + grad_weight_ptr, + indices_ptr, + n_elements, + embedding_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + grid_m = tl.cdiv(n_elements, BLOCK_SIZE_M) + grid_n = tl.cdiv(embedding_dim, BLOCK_SIZE_N) + total_2d_blocks = grid_m * grid_n + + for block_idx in tl.range(pid, total_2d_blocks, num_progs): + block_m = block_idx // grid_n + block_n = block_idx % grid_n + + start_m = block_m * BLOCK_SIZE_M + start_n = block_n * BLOCK_SIZE_N + + offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M) + mask_m = offsets_m < n_elements + + indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0) + + offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N) + mask_n = offsets_n < embedding_dim + + block_mask = mask_m[:, None] & mask_n[None, :] + + grad_output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :] + grad_output = tl.load( + grad_output_ptr + grad_output_offsets, + mask=block_mask, + other=0.0, + ) + + grad_weight_offsets = indices[:, None] * embedding_dim + offsets_n[None, :] + tl.atomic_add( + grad_weight_ptr + grad_weight_offsets, + grad_output, + mask=block_mask, + ) + + +def get_optimal_block_size(total_elements, dtype_size, BLOCK_SIZE_N: tl.constexpr): + # 1. Set Memory Multiplier + # 3.0 are empirical values based on Atlas 800I A2 UB (192KB) + # embedding_offsets, embedding_offsets : BLOCK_SIZE_N * BLOCK_SIZE_M (total 2 * BLOCK_SIZE_N * BLOCK_SIZE_M) + # Reserve a unit of space for the remaining one-dimensional ub to occupy. + # A conservative estimate of the total space occupation is 3 * BLOCK_SIZE_N * BLOCK_SIZE_M + multiplier = 3.0 + + # 2. Call calculation function + # Treat input as 1D (total_elements,), only tiling on dim 0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=dtype_size, + memory_multiplier=multiplier, + shapes=((total_elements, BLOCK_SIZE_N),), + tiling_dims=(0,), + ) + + # 3. 
Parse result + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return block_size + else: + return triton.next_power_of_2(min(128, total_elements)) + + +def embedding_forward(embeddings, indices): + ori_shape = indices.shape + indices = indices.view(-1) + + n_elements = indices.numel() + embedding_dim = embeddings.shape[1] + output = torch.empty( + indices.shape[0], + embeddings.shape[1], + device=indices.device, + dtype=embeddings.dtype, + ) + + # Due to the involvement of two-dimensional partitioning, + # the sizes of block_m and block_n in the ub space will influence each other. + # Considering that embedding_dim is usually relatively smaller in most cases, + # a value is first assigned to block_n, and then the largest possible block_m is used. + BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim)) + BLOCK_SIZE_M = get_optimal_block_size(n_elements, embeddings.element_size(), BLOCK_SIZE_N) + num_cores = get_npu_core_count() + total_blocks = triton.cdiv(n_elements, BLOCK_SIZE_M) * triton.cdiv(embedding_dim, BLOCK_SIZE_N) + grid = min(num_cores, total_blocks) + + embedding_forward_kernel[(grid,)]( + embeddings, + indices, + output, + n_elements, + embedding_dim=embedding_dim, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + + return output.view(*ori_shape, -1) + + +def embedding_backward(embeddings, indices, grad_output): + grad_output = grad_output.contiguous().view(-1, embeddings.shape[1]) + + grad_weight = torch.zeros_like(embeddings) + + n_elements = indices.numel() + embedding_dim = embeddings.shape[1] + BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim)) + BLOCK_SIZE_M = get_optimal_block_size(n_elements, embeddings.element_size(), BLOCK_SIZE_N) + num_cores = get_npu_core_count() + total_blocks = triton.cdiv(n_elements, BLOCK_SIZE_M) * triton.cdiv(embedding_dim, BLOCK_SIZE_N) + grid = min(num_cores, total_blocks) + + embedding_backward_kernel[(grid,)]( + grad_output, + grad_weight, + indices, + n_elements, + embedding_dim=embedding_dim, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + + return grad_weight + + +class LigerEmbeddingFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, embeddings: torch.Tensor, indices: torch.Tensor): + output = embedding_forward(embeddings, indices) + ctx.save_for_backward(indices, embeddings) + return output + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor): + indices, embeddings = ctx.saved_tensors + grad_weight = embedding_backward(embeddings, indices, grad_output) + + return grad_weight, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/fused_add_rms_norm.py b/src/liger_kernel/ops/backends/_ascend/ops/fused_add_rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..07e4c0db0d5c3e7ba8319a30a727edb604698cb3 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/fused_add_rms_norm.py @@ -0,0 +1,781 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import rsqrt + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import torch_to_triton_dtype + +_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1) +_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0) +_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1) + + +def torch_dtype_to_triton(dtype): + mapping 
= { + torch.float32: tl.float32, + torch.bfloat16: tl.bfloat16, + } + return mapping.get(dtype, tl.float32) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _fused_add_rms_norm_forward_kernel_no_tiling( + Y_ptr, + Y_row_stride, + S_ptr, # output residual + S_row_stride, + X_ptr, + X_row_stride, + R_ptr, # input residual + R_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + n_rows, + n_cols, + eps, + offset, + casting_mode: tl.constexpr, + X_DTYPE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + NPU-optimized fused_add_rms_norm forward kernel for small n_cols (< 2048). + + Performance optimizations: + 1. Keep S_row in registers, avoid reload from memory + 2. Minimize type conversions by careful ordering + 3. Use optimal cache policies + 4. Preload W_row while computing rstd (instruction-level parallelism) + 5. Use 2D vector loading to maximize UB utilization (e.g., (1,2048), (2,1024), (4,512)) + + Used when n_cols < 2048 to avoid the overhead of column blocking. + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(X_DTYPE) + offset = offset.to(X_DTYPE) + + # Grid-stride loop setup for 2D blocks + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0) + + # Grid-stride loop over row blocks + for i in tl.range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + # Load multiple rows at once using 2D indexing + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + R_rows = tl.load( + R_ptr + row_idx[:, None] * R_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + S_rows = X_rows + R_rows + + # Compute sum_square for all rows + if casting_mode == _CASTING_MODE_LLAMA or casting_mode == _CASTING_MODE_GEMMA: + S_rows = S_rows.to(tl.float32) + + sum_squares = tl.sum(tl.where(block_mask, S_rows * S_rows, 0.0), axis=1) + + # Compute rstd for all rows + mean_squares = sum_squares / n_cols + rstd_rows = rsqrt(mean_squares + eps) + + # Store S_rows and rstd_rows + tl.store( + S_ptr + row_idx[:, None] * S_row_stride + col_offsets[None, :], + S_rows, + mask=block_mask, + cache_modifier=".cg", + ) + tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd_rows, mask=row_mask) + + # Normalize and apply weight - optimized for each casting mode + if casting_mode == _CASTING_MODE_GEMMA: + Y_rows = ((S_rows * rstd_rows[:, None]) * (offset + W_row[None, :])).to(X_DTYPE) + elif casting_mode == _CASTING_MODE_LLAMA: + S_normalized = (S_rows * rstd_rows[:, None]).to(X_DTYPE) + Y_rows = S_normalized * (offset + W_row[None, :]) + else: + Y_rows = (S_rows * rstd_rows[:, None]) * (offset + W_row[None, :]) + + # Store results + tl.store(Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :], Y_rows, mask=block_mask) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - With Tiling (for n_cols 
> 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _fused_add_rms_norm_forward_kernel_npu( + Y_ptr, + Y_row_stride, + S_ptr, # output residual + S_row_stride, + X_ptr, + X_row_stride, + R_ptr, # input residual + R_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + n_rows, + n_cols, + eps, + offset, + casting_mode: tl.constexpr, + X_DTYPE: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + NPU-optimized fused_add_rms_norm forward kernel. + + This kernel processes rows using a grid-stride loop pattern: + 1. Each program handles multiple rows + 2. For each row, we process it in column chunks of BLOCK_SIZE_N + 3. Grid size is limited to NPU core count to avoid resource overflow + + This solves two problems: + 1. UB overflow when n_cols is too large (original kernel used n_cols as BLOCK_SIZE_N) + 2. Efficient multi-row processing within a single kernel launch + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(X_DTYPE) + offset = offset.to(X_DTYPE) + + offsets = tl.arange(0, BLOCK_SIZE) + # Grid-stride loop over rows + for row_idx in tl.range(pid, n_rows, num_progs): + Y_row_ptr = Y_ptr + row_idx * Y_row_stride + S_row_ptr = S_ptr + row_idx * S_row_stride + X_row_ptr = X_ptr + row_idx * X_row_stride + R_row_ptr = R_ptr + row_idx * R_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Accumulator for mean_square computation across all column blocks + sum_square = 0.0 + + # First pass: compute S_row = X_row + R_row and accumulate sum of squares + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + R_block = tl.load(R_row_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + S_block = X_block + R_block + + # Store S_row + tl.store(S_row_ptr + col_offsets, S_block, mask=mask, cache_modifier=".cg") + + if casting_mode == _CASTING_MODE_LLAMA or casting_mode == _CASTING_MODE_GEMMA: + S_block = S_block.to(tl.float32) + + # Accumulate sum of squares (only for valid elements) + sum_square += tl.sum(tl.where(mask, S_block * S_block, 0.0)) + + # Compute rstd for this row + mean_square = sum_square / n_cols + + rstd = rsqrt(mean_square + eps) + + # Store rstd + tl.store(RSTD_row_ptr, rstd) + + # Second pass: normalize and multiply by weight + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + # Load S_block (already computed in first pass) + S_block = tl.load(S_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".ca") + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0) + + # Apply casting based on mode + if casting_mode == _CASTING_MODE_GEMMA: + S_block = S_block.to(tl.float32) + W_block = W_block.to(tl.float32) + elif casting_mode == _CASTING_MODE_LLAMA: + S_block = S_block.to(tl.float32) + + # Normalize + S_block = S_block * rstd + + # Cast back for Llama mode before weight multiplication + if casting_mode == _CASTING_MODE_LLAMA: + S_block = S_block.to(X_DTYPE) + # Apply weight + Y_block = S_block * (offset + W_block) + + # Cast back for Gemma mode + if casting_mode == _CASTING_MODE_GEMMA: + Y_block = Y_block.to(X_DTYPE) + + # Store result + tl.store(Y_row_ptr + 
col_offsets, Y_block, mask=mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - No Tiling (for n_cols < 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _fused_add_rms_norm_backward_kernel_no_tiling( + dY_ptr, + dY_row_stride, + dS_out_ptr, + dS_out_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + X_dtype: tl.constexpr, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, + dW_row_stride, + n_rows, + n_cols, + offset, + casting_mode: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + has_dS_out: tl.constexpr, +): + """ + NPU-optimized fused_add_rms_norm backward kernel for small n_cols (< 2048). + + Performance optimizations: + 1. Keep all data in registers, minimize conversions + 2. Reuse X_normalized (X * rstd) for both dX and dW + 3. Optimize computation order to reduce register pressure + 4. Combine operations where possible + 5. Use 2D vector loading to maximize UB utilization (e.g., (1,2048), (2,1024), (4,512)) + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-stride loop setup for 2D blocks + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + # Load W once for all iterations + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0) + W_offset = W_row + offset + + # Grid-stride loop over row blocks + for i in tl.range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + dY_rows = tl.load( + dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + + # Load rstd for all rows in the block + rstd_rows = tl.load(RSTD_ptr + row_idx * RSTD_row_stride, mask=row_mask, other=0.0) + + # Convert X to fp32 once + X_rows = X_rows.to(tl.float32) + + # Compute X_normalized (reused in dX and dW) + X_normalized = X_rows * rstd_rows[:, None] + + # Compute m based on casting mode (optimized for each mode) + if casting_mode == _CASTING_MODE_LLAMA: + m_rows = (dY_rows * W_offset[None, :]).to(tl.float32) + # For dW in Llama mode, we need X_normalized in original dtype + X_normalized_for_dW = X_normalized.to(X_dtype) + elif casting_mode == _CASTING_MODE_GEMMA: + m_rows = dY_rows.to(tl.float32) * W_offset[None, :] + X_normalized_for_dW = X_normalized + else: + m_rows = dY_rows * W_offset[None, :] + X_normalized_for_dW = X_normalized + + # Compute sum(m * X) for correction factor + sum_m_X = tl.sum(tl.where(block_mask, m_rows * X_rows, 0.0), axis=1) + + # Compute correction factor + correction_factors = -(1.0 / n_cols) * rstd_rows * rstd_rows * sum_m_X + + # Compute dX = rstd * m + rstd * correction_factor * X + dX_rows = rstd_rows[:, None] * m_rows + rstd_rows[:, None] * correction_factors[:, None] * X_rows + + # Add dS_out gradient if present + if has_dS_out: + dS_out_rows = tl.load( + dS_out_ptr + row_idx[:, None] * dS_out_row_stride + col_offsets[None, :], mask=block_mask, other=0.0 + ) + dX_rows += dS_out_rows + + # Store dX + tl.store(dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :], 
dX_rows.to(X_dtype), mask=block_mask) + + # Compute dW contribution: dY * X_normalized + dW_rows = (dY_rows * X_normalized_for_dW).to(tl.float32) + + # Accumulate to per-program dW buffer + dW_row_ptr = dW_ptr + pid * dW_row_stride + existing_dW = tl.load(dW_row_ptr + col_offsets, mask=col_mask, other=0.0) + new_dW = existing_dW + tl.sum(tl.where(block_mask, dW_rows, 0.0), axis=0) + tl.store(dW_row_ptr + col_offsets, new_dW, mask=col_mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _fused_add_rms_norm_backward_kernel_npu( + dY_ptr, + dY_row_stride, + dS_out_ptr, + dS_out_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + X_dtype: tl.constexpr, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, + dW_row_stride, + n_rows, + n_cols, + offset, + casting_mode: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + has_dS_out: tl.constexpr, +): + """ + NPU-optimized fused_add_rms_norm backward kernel. + + Each program processes multiple rows using grid-stride loop. + For each row, we process columns in blocks to avoid UB overflow. + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Initialize dW accumulator (per-program, will be reduced later) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + offsets = tl.arange(0, BLOCK_SIZE) + + # Grid-stride loop over rows + for row_idx in tl.range(pid, n_rows, num_progs): + # Base pointers for this row + dY_row_ptr = dY_ptr + row_idx * dY_row_stride + dX_row_ptr = dX_ptr + row_idx * dX_row_stride + X_row_ptr = X_ptr + row_idx * X_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Load rstd for this row + rstd = tl.load(RSTD_row_ptr) + + # First pass: compute sum(m * X) for the correction term + sum_m_X = 0.0 + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0) + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0) + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0) + + # Convert to fp32 for computation + X_block = X_block.to(tl.float32) + + # Compute m based on casting mode + W_offset = W_block + offset + + if casting_mode == _CASTING_MODE_LLAMA: + m = (dY_block * W_offset).to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + dY_block = dY_block.to(tl.float32) + m = dY_block * W_offset + else: + m = dY_block * W_offset + + # Accumulate sum(m * X) + sum_m_X += tl.sum(tl.where(mask, m * X_block, 0.0)) + + # Compute the correction factor + correction_factor = -(1.0 / n_cols) * rstd * rstd * sum_m_X + + # Second pass: compute gradients + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0) + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0) + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0) + + X_block = X_block.to(tl.float32) + + # Compute m based on casting mode + W_offset = W_block + offset + + if casting_mode == _CASTING_MODE_LLAMA: + m = (dY_block * W_offset).to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + dY_block = dY_block.to(tl.float32) + m = dY_block * W_offset + else: + m = dY_block * W_offset + + # Compute dX + 
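# (per element: dX = rstd * m - (rstd^3 / n_cols) * sum(m * X) * X,
+            # which is rstd * m + rstd * correction_factor * X, since
+            # correction_factor = -(1 / n_cols) * rstd^2 * sum(m * X))
+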
dX_block = rstd * m + rstd * correction_factor * X_block + + # Add dS_out gradient if present + if has_dS_out: + dS_out_row_ptr = dS_out_ptr + row_idx * dS_out_row_stride + dS_out_block = tl.load(dS_out_row_ptr + col_offsets, mask=mask, other=0.0) + dX_block += dS_out_block + + # Store dX + tl.store(dX_row_ptr + col_offsets, dX_block.to(X_dtype), mask=mask) + + # Compute dW contribution (accumulate per program) + if casting_mode == _CASTING_MODE_LLAMA: + dW_block = dY_block * (X_block * rstd).to(X_dtype) + else: + dW_block = dY_block * (X_block * rstd) + + # Atomic add to dW_ptr (each program writes to its own row) + dW_row_ptr = dW_ptr + pid * dW_row_stride + + # Load existing dW, add contribution, store back + existing_dW = tl.load(dW_row_ptr + col_offsets, mask=mask, other=0.0) + new_dW = existing_dW + dW_block.to(tl.float32) + tl.store(dW_row_ptr + col_offsets, new_dW, mask=mask) + + +# ----------------------------------------------------------------------------- +# Helper Functions +# ----------------------------------------------------------------------------- + + +def get_optimal_block_size(n_cols, is_forward: bool): + """ + Calculate optimal block size for forward pass using compute_default_tiling_strategy. + + Memory analysis for forward pass (per row): + - Load: X_block, R_block, W_block (3 blocks) + - Store: S_block, Y_block (2 blocks) + - Compute: S_block, Y_block intermediate (2 blocks) + - Total: conservative estimate 8 blocks of memory + + Memory analysis for backward pass (per row): + - Load: dY_block, X_block, W_block, existing_dW (4 blocks) + - Store: dX_block, new_dW (2 blocks) + - Compute: m, dX_block intermediate, dW_block intermediate (3 blocks) + - Additional: dS_out_block if present (1 block) + - Total: conservative estimate 12 blocks of memory + + Args: + n_cols: Number of columns in the tensor + + Returns: + Optimal block size + """ + if n_cols <= 2048: + return triton.next_power_of_2(n_cols) + + memory_multiplier = 8.0 if is_forward else 12.0 + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=memory_multiplier, + shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(2048, block_size) + else: + return 2048 + + +# ----------------------------------------------------------------------------- +# Forward and Backward Functions +# ----------------------------------------------------------------------------- + + +_str_to_casting_mode = { + "llama": _CASTING_MODE_LLAMA.value, + "gemma": _CASTING_MODE_GEMMA.value, + "none": _CASTING_MODE_NONE.value, +} + + +def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode): + if not isinstance(casting_mode, int): + assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}" + casting_mode = _str_to_casting_mode[casting_mode] + else: + assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}" + shape = X.shape + dim = shape[-1] + X = X.view(-1, dim) + R = R.view(-1, dim) + n_rows, n_cols = X.shape + X_DTYPE = torch_dtype_to_triton(X.dtype) + + # Get optimal block size for column processing + BLOCK_SIZE = get_optimal_block_size(n_cols, True) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + + # RSTD is always fp32 for Llama/Gemma modes + rstd_dtype = torch.float32 if casting_mode in 
(_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype + RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device) + + # Check constraints + assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension" + + # Grid size limited to NPU core count + num_cores = get_npu_core_count() + grid_size = min(num_cores * 2, n_rows) + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Use no-tiling kernel for small n_cols + _fused_add_rms_norm_forward_kernel_no_tiling[(grid_size,)]( + Y, + Y.stride(0), + S, + S.stride(0), + X, + X.stride(0), + R, + R.stride(0), + W, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + offset, + casting_mode, + X_DTYPE, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + else: + # Use tiled kernel for large n_cols + _fused_add_rms_norm_forward_kernel_npu[(grid_size,)]( + Y, + Y.stride(0), + S, + S.stride(0), + X, + X.stride(0), + R, + R.stride(0), + W, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + offset, + casting_mode, + X_DTYPE, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return Y.view(*shape), S.view(*shape), RSTD, casting_mode + + +def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, in_place): + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + if dS_out is not None: + dS_out = dS_out.view(-1, dim) + S = S.view(-1, dim) + n_rows, n_cols = dY.shape + + # Get NPU core count for grid size + num_cores = get_npu_core_count() + grid_size = min(num_cores * 2, n_rows) + + # Get optimal block size for backward pass + BLOCK_SIZE = get_optimal_block_size(n_cols, False) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + # fp32 for numerical stability + _dW = torch.zeros((grid_size, n_cols), dtype=torch.float32, device=W.device) + + if in_place: + dX = dY + else: + dX = torch.empty_like(dY) + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Use no-tiling kernel for small n_cols + _fused_add_rms_norm_backward_kernel_no_tiling[(grid_size,)]( + dY, + dY.stride(0), + dS_out if dS_out is not None else dY, # Dummy pointer if dS_out is None + dS_out.stride(0) if dS_out is not None else 0, + dX, + dX.stride(0), + S, + S.stride(0), + torch_to_triton_dtype[S.dtype], + W, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0), + n_rows, + n_cols, + offset, + casting_mode, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + has_dS_out=dS_out is not None, + ) + else: + # Use tiled kernel for large n_cols + _fused_add_rms_norm_backward_kernel_npu[(grid_size,)]( + dY, + dY.stride(0), + dS_out if dS_out is not None else dY, # Dummy pointer if dS_out is None + dS_out.stride(0) if dS_out is not None else 0, + dX, + dX.stride(0), + S, + S.stride(0), + torch_to_triton_dtype[S.dtype], + W, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0), + n_rows, + n_cols, + offset, + casting_mode, + BLOCK_SIZE=BLOCK_SIZE, + has_dS_out=dS_out is not None, + ) + + dX = dX.view(*shape) + dW = _dW.sum(dim=0).to(W.dtype) + + return dX, dX, dW # dR is equal to dX + + +# ----------------------------------------------------------------------------- +# Autograd Function +# ----------------------------------------------------------------------------- + + +class LigerFusedAddRMSNormFunction(torch.autograd.Function): + """ + NPU-optimized fused operation for residual addition and RMSNorm. + + This implementation solves two key issues: + 1. UB overflow when n_cols is too large (by using column-wise blocking) + 2. 
Efficient multi-row processing (by using grid-stride loop with core count limit) + """ + + @staticmethod + @ensure_contiguous + def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False): + """ + X: (B, T, H) or (BxT, H) + R: (B, T, H) or (BxT, H) + W: (H,) + """ + Y, S, RSTD, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode) + ctx.offset = offset + ctx.casting_mode = casting_mode + ctx.in_place = in_place + ctx.save_for_backward(S, W, RSTD) + return Y, S + + @staticmethod + @ensure_contiguous + def backward(ctx, dY, dS_out): + """ + dY: (B, T, H) or (BxT, H) + dS_out: (B, T, H) or (BxT, H) + """ + S, W, RSTD = ctx.saved_tensors + dX, dR, dW = fused_add_rms_norm_backward( + dY, + dS_out, + S, + W, + RSTD, + ctx.offset, + ctx.casting_mode, + ctx.in_place, + ) + + return dX, dR, dW, None, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/fused_linear_jsd.py b/src/liger_kernel/ops/backends/_ascend/ops/fused_linear_jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..10c8fc354217d6c6d511358346a9466670a57fa2 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/fused_linear_jsd.py @@ -0,0 +1,227 @@ +from typing import Optional + +import torch +import triton + +from liger_kernel.ops.backends._ascend.ops.jsd import _jsd_kernel +from liger_kernel.ops.utils import amp_custom_bwd +from liger_kernel.ops.utils import amp_custom_fwd +from liger_kernel.ops.utils import element_mul_kernel +from liger_kernel.ops.utils import get_npu_core_count + +MAX_FUSED_SIZE = 4096 + + +def fused_linear_jsd_forward( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels, + jsd_beta, + ignore_index, + has_label, + temperature, +): + device = student_input.device + dtype = student_input.dtype + + # inputs have shape: BT x H + # materialized activations will have shape: BT x V + # the increase in memory = BT x V + # reduction can be achieved by partitioning the number of tokens BT into smaller chunks. + # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be: + # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor + # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048 + BT, H = student_input.shape + V = student_weight.shape[0] + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + + inc_factor = triton.cdiv(V, H) # (V + H - 1) // H + chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor + num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size + + grad_weight = torch.zeros_like(student_weight, device=device) if student_weight.requires_grad else None + grad_input = torch.zeros_like(student_input) + # we use fp32 for loss accumulator + loss_1d = torch.zeros((BT, V), dtype=torch.float32, device=device) + + if has_label: + n_non_ignore = (shift_labels != ignore_index).sum().item() + else: + n_non_ignore = BT + + num_cores = get_npu_core_count() + + for chunk_id in range(num_chunks): + start_idx = chunk_id * chunk_size + end_idx = min((chunk_id + 1) * chunk_size, BT) + + # chunk both inputs, shape: chunk_size x H + student_input_chunk = student_input[start_idx:end_idx] + teacher_input_chunk = teacher_input[start_idx:end_idx] + + # shape: chunk_size x V + # For anything starting from logits to the final JSD loss, we do computation + # in FP32 to avoid losing numerical stability. 
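+        # Each chunk materializes only a (chunk_size x V) fp32 logits buffer
+        # instead of the full (BT x V). With the example numbers above
+        # (BT = 4096*4, V = 32000, H = 4096) that is 8 chunks of 2048 rows each.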
+ student_logits_chunk = (student_input_chunk @ student_weight.t()).to(torch.float32) + teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(torch.float32) + chunk_n_rows = student_logits_chunk.shape[0] + + # unreduced loss + loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size + # log-softmax with temperature + student_logits_chunk = student_logits_chunk / temperature + teacher_logits_chunk = teacher_logits_chunk / temperature + student_prob_chunk = torch.log_softmax(student_logits_chunk, dim=-1) + teacher_prob_chunk = torch.log_softmax(teacher_logits_chunk, dim=-1) + + # ensure _input and target are contiguous + student_prob_chunk = student_prob_chunk.contiguous() + teacher_prob_chunk = teacher_prob_chunk.contiguous() + + # Here we calculate the gradient of prob_chunk in place so we can save memory. + # Grid size is capped at NPU core count; the kernel uses a grid-stride loop + # to process multiple rows per program, consistent with the NPU backend pattern. + grid_size = min(num_cores, chunk_n_rows) + _jsd_kernel[(grid_size,)]( + X_ptr=student_prob_chunk, + X_stride=student_prob_chunk.stride(-2), + Y_ptr=teacher_prob_chunk, + Y_stride=teacher_prob_chunk.stride(-2), + loss_ptr=loss_1d_slice, + loss_stride=loss_1d_slice.stride(-2), + dX_ptr=student_prob_chunk, + dX_stride=student_prob_chunk.stride(-2), + label_ptr=( + shift_labels[start_idx:end_idx] if has_label else torch.empty(1, device=device) + ), # dummy ptr if no label + beta=jsd_beta, + n_non_ignore=n_non_ignore, + ignore_index=ignore_index, + n_rows=chunk_n_rows, + n_cols=V, + BLOCK_SIZE=BLOCK_SIZE, + HAS_LABEL=has_label, + ) + # gradients of prob_chunk in place, shape: chunk_size x V + # gradients of logits_chunk in place, shape: chunk_size x V + student_logits_chunk = ( + student_prob_chunk + - torch.softmax(student_logits_chunk, dim=-1) + * student_prob_chunk.sum(dim=-1, keepdim=True).expand_as(student_prob_chunk) + ) / temperature + # now we traverse back to grad w.r.t. input to `lm_head` and grad + # w.r.t. `lm_head` which should be computed in original dtype + student_logits_chunk = student_logits_chunk.to(dtype) + grad_input[start_idx:end_idx] = student_logits_chunk @ student_weight + + if grad_weight is not None: + grad_weight.add_(student_logits_chunk.t() @ student_input_chunk) + + loss = torch.sum(loss_1d) + return loss, grad_input, grad_weight + + +def fused_linear_jsd_backward(grad_output, grad_input, grad_weight): + # If JSD is the last layer, grad_output is 1.0. Skip the mul to save time + if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. + BT, H = grad_input.shape + n_rows = BT + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + element_mul_kernel[(n_rows,)]( + grad_input, + grad_input.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + ) + + # handle grad_weight + if grad_weight is not None: + V, H = grad_weight.shape + n_rows = V + + element_mul_kernel[(n_rows,)]( + grad_weight, + grad_weight.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return grad_input, grad_weight + + +class LigerFusedLinearJSDFunction(torch.autograd.Function): + """ + Fusing the last linear layer with generalized JSD + + Handle the forward and backward pass of the final linear layer via JSD by avoiding + the materialization of the large logits tensor. 
Since JSD is the last layer, we can + compute the gradient at the forward pass. + """ + + @staticmethod + @amp_custom_fwd + def forward( + ctx, + student_input: torch.Tensor, + student_weight: torch.Tensor, + teacher_input: torch.Tensor, + teacher_weight: torch.Tensor, + shift_labels: Optional[torch.Tensor] = None, + jsd_beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + """ + Args: + + student_input (torch.tensor): input of the last projection layer in student model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension. + student_weight (torch.tensor): the last projection layer in student model, with shape (V, H), where V is vocab size + teacher_input (torch.tensor): input of the last projection layer in teacher model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension. + teacher_weight (torch.tensor): the last projection layer in teacher model, with shape (V, H), where V is vocab size + shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1]. + jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5` + ignore_index (int): the index to ignore. Default: -100 + temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0` + + Returns: + loss (torch.Tensor): generalized JSD + """ + has_label = False + if shift_labels is not None: + assert shift_labels.shape == (teacher_input.shape[0],), ( + f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, grad_input, grad_weight = fused_linear_jsd_forward( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels, + jsd_beta, + ignore_index, + has_label, + temperature, + ) + # downcast to dtype and store for backward + ctx.save_for_backward( + grad_input.detach(), + grad_weight.detach() if grad_weight is not None else None, + ) + return loss + + @staticmethod + @amp_custom_bwd + def backward(ctx, grad_output): + (grad_input, grad_weight) = ctx.saved_tensors + grad_input, grad_weight = fused_linear_jsd_backward(grad_output, grad_input, grad_weight) + return (grad_input, grad_weight, None, None, None, None, None, None) diff --git a/src/liger_kernel/ops/backends/_ascend/ops/geglu.py b/src/liger_kernel/ops/backends/_ascend/ops/geglu.py new file mode 100755 index 0000000000000000000000000000000000000000..123b2b4f262b4e7744961f4b21f96b3a1379678a --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/geglu.py @@ -0,0 +1,187 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import tanh + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _geglu_forward_kernel_flat(a_ptr, b_ptr, c_ptr, total_elements, BLOCK_SIZE: tl.constexpr): + """ + High-performance GEGLU forward kernel using flatten 1D approach. + + Uses grid-stride loop pattern for optimal performance on NPU. 
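+
+    Computes, element-wise:
+        c = gelu_tanh(a) * b,
+    where gelu_tanh(a) = 0.5 * a * (1 + tanh(sqrt(2/pi) * (a + 0.044715 * a^3)))
+    is the tanh approximation of GELU applied to the gate before the multiply.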
+ """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-Stride Loop + start_idx = pid * BLOCK_SIZE + stride = num_progs * BLOCK_SIZE + + # Constants for GELU tanh approximation + sqrt_2_over_pi = 0.7978845608028654 # sqrt(2 / pi) + gelu_coeff = 0.044715 + + for idx in tl.range(start_idx, total_elements, stride): + offsets = idx + tl.arange(0, BLOCK_SIZE) + mask = offsets < total_elements + + a_val = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + b_val = tl.load(b_ptr + offsets, mask=mask, other=0.0) + + # tanh approximation form of GELU is computed with: + # 0.5 * a * (1 + tanh(sqrt(2 / pi) * (a + 0.044715 * a^3))) + a_cubed = a_val * a_val * a_val + tanh_arg = sqrt_2_over_pi * (a_val + gelu_coeff * a_cubed) + tanh_result = tanh(tanh_arg) + geglu_a = 0.5 * a_val * (1.0 + tanh_result) + c_row = geglu_a.cast(b_val.dtype) * b_val + tl.store(c_ptr + offsets, c_row, mask=mask) + + +@triton.jit +def _geglu_backward_kernel_flat(dc_ptr, a_ptr, b_ptr, da_ptr, db_ptr, total_elements, BLOCK_SIZE: tl.constexpr): + """ + High-performance GEGLU backward kernel using flatten 1D approach. + + Uses grid-stride loop pattern for optimal performance on NPU. + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + start_idx = pid * BLOCK_SIZE + stride = num_progs * BLOCK_SIZE + + # Constants for GELU tanh approximation + sqrt_2_over_pi = 0.7978845608028654 # sqrt(2 / pi) + gelu_coeff = 0.044715 + + for idx in tl.range(start_idx, total_elements, stride): + offsets = idx + tl.arange(0, BLOCK_SIZE) + mask = offsets < total_elements + + dc = tl.load(dc_ptr + offsets, mask=mask, other=0.0) + a = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + b = tl.load(b_ptr + offsets, mask=mask, other=0.0) + + # recomputation to save memory + a_cubed = a * a * a + tanh_arg = sqrt_2_over_pi * (a + gelu_coeff * a_cubed) + tanh_result = tanh(tanh_arg) + geglu_a = 0.5 * a * (1 + tanh_result) + geglu_a = geglu_a.to(dc.dtype).to(tl.float32) + + db = dc.cast(tl.float32) * geglu_a + + # Gradient w.r.t. a can be computed with: + # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2))) + # where z = sqrt(2/pi) * (a + 0.044715 * a^3) + term1 = 0.5 * (1.0 + tanh_result) + tanh_sq = tanh_result * tanh_result + a_sq = a * a + term2 = 0.5 * a * (1.0 - tanh_sq) * (sqrt_2_over_pi * (1.0 + 3.0 * gelu_coeff * a_sq)) + da = dc * b * (term1 + term2) + + tl.store(da_ptr + offsets, da, mask=mask) + tl.store(db_ptr + offsets, db.to(dc.dtype), mask=mask) + + +def get_optimal_block_size(total_elements, is_backward=False): + """ + Calculate optimal Block Size using compute_default_tiling_strategy. + + Args: + total_elements: Total number of elements to process + is_backward: Whether this is for backward pass (requires more memory) + + Returns: + Optimal block size for the kernel + """ + # Memory multiplier based on peak memory usage analysis + if is_backward: + memory_multiplier = 6.0 + else: + memory_multiplier = 3.0 + # Call calculation function + # Treat input as 1D (total_elements,), only tiling on dim 0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=memory_multiplier, + shapes=((total_elements,),), + tiling_dims=(0,), + ) + + # Parse result + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(256, block_size) + else: + return 2048 + + +def geglu_forward(a, b): + """ + High-performance GEGLU forward pass for NPU using flatten 1D approach. 
+ """ + if not a.is_contiguous(): + a = a.contiguous() + if not b.is_contiguous(): + b = b.contiguous() + + total_elements = a.numel() + c = torch.empty_like(a) + + block_size = get_optimal_block_size(total_elements, is_backward=False) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, (total_elements + block_size - 1) // block_size) + + _geglu_forward_kernel_flat[(grid_size,)](a, b, c, total_elements, BLOCK_SIZE=block_size) + return c + + +def geglu_backward(a, b, dc): + """ + High-performance GEGLU backward pass for NPU using flatten 1D approach. + """ + if not dc.is_contiguous(): + dc = dc.contiguous() + if not a.is_contiguous(): + a = a.contiguous() + if not b.is_contiguous(): + b = b.contiguous() + + total_elements = dc.numel() + grad_a = torch.empty_like(a) + grad_b = torch.empty_like(b) + + block_size = get_optimal_block_size(total_elements, is_backward=True) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, (total_elements + block_size - 1) // block_size) + + _geglu_backward_kernel_flat[(grid_size,)](dc, a, b, grad_a, grad_b, total_elements, BLOCK_SIZE=block_size) + return grad_a, grad_b + + +class LigerGELUMulFunction(torch.autograd.Function): + """High-performance GEGLU function for Ascend NPU.""" + + @staticmethod + @ensure_contiguous + def forward(ctx, a, b): + c = geglu_forward(a, b) + ctx.save_for_backward(a, b) + return c + + @staticmethod + @ensure_contiguous + def backward(ctx, dc): + a, b = ctx.saved_tensors + grad_a, grad_b = geglu_backward(a, b, dc) + return grad_a, grad_b diff --git a/src/liger_kernel/ops/backends/_ascend/ops/group_norm.py b/src/liger_kernel/ops/backends/_ascend/ops/group_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..85d9efc751ebf49ea986895a9800c92ee9b1ee24 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/group_norm.py @@ -0,0 +1,474 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import rsqrt + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +# ----------------------------------------------------------------------------- +# Kernels (2D row/col tiling + persistent programs) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _group_norm_forward_kernel( + Y_ptr, # pointer to output, shape (B, G, hidden_size) + Y_row_stride, # stride of each batch row in Y + Y_col_stride, # stride of each group row in Y + X_ptr, # pointer to input, shape (B, G, hidden_size) + X_row_stride, # stride of each batch row in X + X_col_stride, # stride of each group row in X + Mean_ptr, # pointer to mean output, shape (B, G) + Mean_row_stride, # stride of each batch row in Mean + Mean_col_stride, # stride of each group row in Mean + RSTD_ptr, # pointer to rstd output, shape (B, G) + RSTD_row_stride, # stride of each batch row in RSTD + RSTD_col_stride, # stride of each group row in RSTD + W_ptr, # pointer to affine scale weights, shape (C) + B_ptr, # pointer to affine bias weights, shape (C) + n_rows, # total logical rows = B * G + hidden_size, + channels_per_group, + num_groups, + SINGLE_CHANNEL_TILE: tl.constexpr, + eps, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + grid_m = tl.cdiv(n_rows, BLOCK_SIZE_M) + num_col_blocks = tl.cdiv(hidden_size, BLOCK_SIZE_N) + 
hidden_size_per_channel = hidden_size // channels_per_group + hidden_size_inv = 1.0 / hidden_size + row_offsets = tl.arange(0, BLOCK_SIZE_M) + col_offsets_base = tl.arange(0, BLOCK_SIZE_N) + + # Persistent-program loop over row tiles. + for block_m in tl.range(pid, grid_m, num_progs): + row_idx = block_m * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + batch_idx = row_idx // num_groups + group_idx = row_idx % num_groups + + row_sum = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + row_square_sum = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + + # Pass 1: accumulate E[x] and E[x^2] for each row tile. + for cb in range(num_col_blocks): + col_offsets = cb * BLOCK_SIZE_N + col_offsets_base + col_mask = col_offsets < hidden_size + mask = row_mask[:, None] & col_mask[None, :] + X_ptrs = ( + X_ptr + batch_idx[:, None] * X_row_stride + group_idx[:, None] * X_col_stride + col_offsets[None, :] + ) + X_block = tl.load(X_ptrs, mask=mask, other=0.0).to(tl.float32) + row_sum += tl.sum(X_block, axis=1) + row_square_sum += tl.sum(X_block * X_block, axis=1) + + mean = row_sum * hidden_size_inv + var = row_square_sum * hidden_size_inv - mean * mean + rstd = rsqrt(tl.maximum(var, 0.0) + eps) + + mean_ptrs = Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride + rstd_ptrs = RSTD_ptr + batch_idx * RSTD_row_stride + group_idx * RSTD_col_stride + tl.store(mean_ptrs, mean, mask=row_mask) + tl.store(rstd_ptrs, rstd, mask=row_mask) + + # Pass 2: normalize + affine transform. + # SINGLE_CHANNEL_TILE indicates the current column tile maps to one channel, + # so W/B can be loaded once per row and broadcast to the tile. + for cb in range(num_col_blocks): + col_offsets = cb * BLOCK_SIZE_N + col_offsets_base + col_mask = col_offsets < hidden_size + mask = row_mask[:, None] & col_mask[None, :] + X_ptrs = ( + X_ptr + batch_idx[:, None] * X_row_stride + group_idx[:, None] * X_col_stride + col_offsets[None, :] + ) + X_block = tl.load(X_ptrs, mask=mask, other=0.0).to(tl.float32) + if SINGLE_CHANNEL_TILE: + local_channel = (cb * BLOCK_SIZE_N) // hidden_size_per_channel + global_channel = group_idx * channels_per_group + local_channel + W_block = tl.load(W_ptr + global_channel, mask=row_mask, other=0.0).to(tl.float32)[:, None] + B_block = tl.load(B_ptr + global_channel, mask=row_mask, other=0.0).to(tl.float32)[:, None] + else: + local_channel = col_offsets // hidden_size_per_channel + global_channel = group_idx[:, None] * channels_per_group + local_channel[None, :] + W_block = tl.load(W_ptr + global_channel, mask=mask, other=0.0).to(tl.float32) + B_block = tl.load(B_ptr + global_channel, mask=mask, other=0.0).to(tl.float32) + Y_block = (X_block - mean[:, None]) * rstd[:, None] * W_block + B_block + Y_ptrs = ( + Y_ptr + batch_idx[:, None] * Y_row_stride + group_idx[:, None] * Y_col_stride + col_offsets[None, :] + ) + tl.store(Y_ptrs, Y_block, mask=mask) + + +@triton.jit +def _group_norm_backward_kernel( + X_ptr, # pointer to input, shape (B, G, hidden_size) + X_row_stride, # stride of each batch row in X + X_col_stride, # stride of each group row in X + W_ptr, # pointer to affine scale weights, shape (C) + Mean_ptr, # pointer to saved group mean, shape (B, G) + Mean_row_stride, # stride of each batch row in Mean + Mean_col_stride, # stride of each group row in Mean + RSTD_ptr, # pointer to saved reciprocal std, shape (B, G) + DX_ptr, # pointer to input gradients, shape (B, G, hidden_size) + DW_scratch_ptr, # pointer to scratch buffer for dW partial sums, shape (grid, C) + DW_scratch_stride, # row stride 
for DW_scratch + DB_scratch_ptr, # pointer to scratch buffer for dB partial sums, shape (grid, C) + DB_scratch_stride, # row stride for DB_scratch + DY_ptr, # pointer to upstream gradients, shape (B, G, hidden_size) + DY_row_stride, # stride of each batch row in DY + DY_col_stride, # stride of each group row in DY + n_rows, # total logical rows = B * G + hidden_size, + channels_per_group, + num_groups, + SINGLE_CHANNEL_TILE: tl.constexpr, + COMPUTE_PARAM_GRAD: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + grid_m = tl.cdiv(n_rows, BLOCK_SIZE_M) + num_col_blocks = tl.cdiv(hidden_size, BLOCK_SIZE_N) + hidden_size_per_channel = hidden_size // channels_per_group + N_inv = 1.0 / hidden_size + row_offsets = tl.arange(0, BLOCK_SIZE_M) + col_offsets_base = tl.arange(0, BLOCK_SIZE_N) + + if COMPUTE_PARAM_GRAD: + DW_scratch_base = DW_scratch_ptr + pid * DW_scratch_stride + DB_scratch_base = DB_scratch_ptr + pid * DB_scratch_stride + + # Persistent-program loop over row tiles. + for block_m in tl.range(pid, grid_m, num_progs): + row_idx = block_m * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + batch_idx = row_idx // num_groups + group_idx = row_idx % num_groups + + mean = tl.load( + Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, + mask=row_mask, + other=0.0, + ).to(tl.float32) + rstd = tl.load( + RSTD_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, + mask=row_mask, + other=0.0, + ).to(tl.float32) + + sum_x_hat_wdy = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + sum_wdy = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32) + + # Pass 1: compute row-wise reduction terms (c1, c2). + for cb in range(num_col_blocks): + col_offsets = cb * BLOCK_SIZE_N + col_offsets_base + col_mask = col_offsets < hidden_size + mask = row_mask[:, None] & col_mask[None, :] + + X_ptrs = ( + X_ptr + batch_idx[:, None] * X_row_stride + group_idx[:, None] * X_col_stride + col_offsets[None, :] + ) + DY_ptrs = ( + DY_ptr + batch_idx[:, None] * DY_row_stride + group_idx[:, None] * DY_col_stride + col_offsets[None, :] + ) + X_block = tl.load(X_ptrs, mask=mask, other=0.0).to(tl.float32) + DY_block = tl.load(DY_ptrs, mask=mask, other=0.0).to(tl.float32) + + if SINGLE_CHANNEL_TILE: + local_channel = (cb * BLOCK_SIZE_N) // hidden_size_per_channel + global_channel = group_idx * channels_per_group + local_channel + W_block = tl.load(W_ptr + global_channel, mask=row_mask, other=0.0).to(tl.float32)[:, None] + else: + local_channel = col_offsets // hidden_size_per_channel + global_channel = group_idx[:, None] * channels_per_group + local_channel[None, :] + W_block = tl.load(W_ptr + global_channel, mask=mask, other=0.0).to(tl.float32) + + x_hat = (X_block - mean[:, None]) * rstd[:, None] + wdy = W_block * DY_block + sum_x_hat_wdy += tl.sum(tl.where(mask, x_hat * wdy, 0.0), axis=1) + sum_wdy += tl.sum(tl.where(mask, wdy, 0.0), axis=1) + + c1 = sum_x_hat_wdy * N_inv + c2 = sum_wdy * N_inv + + # Pass 2: compute DX and optionally accumulate DW/DB. + # COMPUTE_PARAM_GRAD=False is used to skip expensive atomics in cases + # where host-side dense reduction is faster/more stable. 
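+        # Per-element group-norm input gradient:
+        #   dx = (wdy - (x_hat * c1 + c2)) * rstd,
+        # with c1 = mean(x_hat * wdy) and c2 = mean(wdy) over the group; this is
+        # the standard normalization backward with W already folded into dy.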
+ for cb in range(num_col_blocks): + col_offsets = cb * BLOCK_SIZE_N + col_offsets_base + col_mask = col_offsets < hidden_size + mask = row_mask[:, None] & col_mask[None, :] + + X_ptrs = ( + X_ptr + batch_idx[:, None] * X_row_stride + group_idx[:, None] * X_col_stride + col_offsets[None, :] + ) + DY_ptrs = ( + DY_ptr + batch_idx[:, None] * DY_row_stride + group_idx[:, None] * DY_col_stride + col_offsets[None, :] + ) + X_block = tl.load(X_ptrs, mask=mask, other=0.0).to(tl.float32) + DY_block = tl.load(DY_ptrs, mask=mask, other=0.0).to(tl.float32) + + if SINGLE_CHANNEL_TILE: + local_channel = (cb * BLOCK_SIZE_N) // hidden_size_per_channel + global_channel = group_idx * channels_per_group + local_channel + W_block = tl.load(W_ptr + global_channel, mask=row_mask, other=0.0).to(tl.float32)[:, None] + else: + local_channel = col_offsets // hidden_size_per_channel + global_channel = group_idx[:, None] * channels_per_group + local_channel[None, :] + W_block = tl.load(W_ptr + global_channel, mask=mask, other=0.0).to(tl.float32) + + x_hat = (X_block - mean[:, None]) * rstd[:, None] + wdy = W_block * DY_block + DX_block = (wdy - (x_hat * c1[:, None] + c2[:, None])) * rstd[:, None] + + DX_ptrs = ( + DX_ptr + batch_idx[:, None] * X_row_stride + group_idx[:, None] * X_col_stride + col_offsets[None, :] + ) + tl.store(DX_ptrs, DX_block.to(X_ptr.dtype.element_ty), mask=mask) + + if COMPUTE_PARAM_GRAD: + if SINGLE_CHANNEL_TILE: + dW_partial = tl.sum(tl.where(mask, DY_block * x_hat, 0.0), axis=1) + dB_partial = tl.sum(tl.where(mask, DY_block, 0.0), axis=1) + tl.atomic_add(DW_scratch_base + global_channel, dW_partial, mask=row_mask) + tl.atomic_add(DB_scratch_base + global_channel, dB_partial, mask=row_mask) + + +# ----------------------------------------------------------------------------- +# Helper: call compute_default_tiling_strategy +# ----------------------------------------------------------------------------- + + +def get_optimal_block_size(n_rows, dtype_size, BLOCK_SIZE_N, is_backward: bool = False): + # Backward keeps larger live-state than forward in this kernel. + multiplier = 7.0 if is_backward else 6.0 + + # Use fp32-size as conservative UB estimate for tiling. + dtype_size = max(dtype_size, 4) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=dtype_size, + memory_multiplier=multiplier, + shapes=((n_rows, BLOCK_SIZE_N),), + tiling_dims=(0,), + ) + if tile_shapes and len(tile_shapes) > 0: + return tile_shapes[0][0] + return triton.next_power_of_2(min(128, n_rows)) + + +def group_norm_forward(X, num_channels, num_groups, W, B, eps): + shape = X.shape + batch_size = shape[0] + channels_per_group = num_channels // num_groups + # Reshape X so that the mean / std are computed across each group + X = X.view(batch_size, num_groups, -1).contiguous() + + hidden_size = X.shape[-1] + hidden_size_per_channel = hidden_size // channels_per_group + n_rows = batch_size * num_groups + + BLOCK_SIZE_N = min(128, triton.next_power_of_2(hidden_size)) + BLOCK_SIZE_M = get_optimal_block_size(n_rows, X.element_size(), BLOCK_SIZE_N) + + # Fast path condition: each column tile must lie entirely inside one channel + # segment of length `hidden_size_per_channel`. + # + # Layout of a row: + # | channel0 | channel1 | channel2 | ... + # |----Hc----|----Hc----| + # Hc = hidden_size_per_channel + # + # The kernel processes tiles of shape (BLOCK_SIZE_M, BLOCK_SIZE_N). + # Channel boundaries exist only along the column dimension, because + # each row corresponds to a different (batch, group). 
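+    # Channel identity is a pure function of the column offset within a row
+    # (column c maps to channel c // Hc; e.g. with Hc = 192, column 200 lies in
+    # channel 1), never of the row index.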
+ # + # Therefore only BLOCK_SIZE_N matters for whether a tile crosses + # channel boundaries; BLOCK_SIZE_M does not affect channel mapping. + # + # If BLOCK_SIZE_N divides Hc and is <= Hc, each column tile belongs + # to exactly one channel. In that case W/B can be loaded once and + # broadcast across the tile (fast path). + # + # Otherwise a tile may span multiple channels, requiring per-element + # channel index computation and parameter loads (slow path). + single_channel_tile = BLOCK_SIZE_N <= hidden_size_per_channel and hidden_size_per_channel % BLOCK_SIZE_N == 0 + + num_cores = get_npu_core_count() + grid = min(num_cores, triton.cdiv(n_rows, BLOCK_SIZE_M)) + + Y = torch.empty((batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device) + Mean = torch.empty((batch_size, num_groups), dtype=X.dtype, device=X.device) + RSTD = torch.empty((batch_size, num_groups), dtype=X.dtype, device=X.device) + + _group_norm_forward_kernel[(grid,)]( + Y, + Y.stride(0), + Y.stride(1), + X, + X.stride(0), + X.stride(1), + Mean, + Mean.stride(0), + Mean.stride(1), + RSTD, + RSTD.stride(0), + RSTD.stride(1), + W, + B, + n_rows, + hidden_size, + channels_per_group, + num_groups, + SINGLE_CHANNEL_TILE=single_channel_tile, + eps=eps, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + return Y.view(*shape), X.view(*shape), Mean, RSTD + + +def group_norm_backward(dY, X, W, B, Mean, RSTD, num_channels, num_groups): + shape = dY.shape + batch_size = shape[0] + channels_per_group = num_channels // num_groups + X_grouped = X.view(batch_size, num_groups, -1) + dY_grouped = dY.view(batch_size, num_groups, -1) + hidden_size = dY_grouped.shape[-1] + hidden_size_per_channel = hidden_size // channels_per_group + n_rows = batch_size * num_groups + + BLOCK_SIZE_N = min(128, triton.next_power_of_2(hidden_size)) + BLOCK_SIZE_M = get_optimal_block_size( + n_rows, + X.element_size(), + BLOCK_SIZE_N, + is_backward=True, + ) + + # Same condition as forward: + # if true, each BLOCK_SIZE_N tile maps cleanly to one channel segment. + single_channel_tile = BLOCK_SIZE_N <= hidden_size_per_channel and hidden_size_per_channel % BLOCK_SIZE_N == 0 + + num_cores = get_npu_core_count() + grid = min(num_cores, triton.cdiv(n_rows, BLOCK_SIZE_M)) + # For non-single-channel tiles, per-element atomic updates are costly. + # In that case, kernel computes DX only and DW/DB are reduced on host side. + compute_param_grad = single_channel_tile + + DX = torch.empty((batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device) + if compute_param_grad: + DW_scratch = torch.zeros((grid, num_channels), dtype=torch.float32, device=W.device) + DB_scratch = torch.zeros((grid, num_channels), dtype=torch.float32, device=W.device) + else: + # Not used when COMPUTE_PARAM_GRAD=False. + # Intentionally set to None to enforce fail-fast behavior if accidentally accessed. 
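+        # (The dense DW/DB reduction for this case happens on the host after the
+        # kernel returns; see the fallback branch further down.)
+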
+ DW_scratch = None + DB_scratch = None + + _group_norm_backward_kernel[(grid,)]( + X_grouped, + X_grouped.stride(0), + X_grouped.stride(1), + W, + Mean, + Mean.stride(0), + Mean.stride(1), + RSTD, + DX, + DW_scratch, + 0 if not compute_param_grad else DW_scratch.stride(0), + DB_scratch, + 0 if not compute_param_grad else DB_scratch.stride(0), + dY_grouped, + dY_grouped.stride(0), + dY_grouped.stride(1), + n_rows, + hidden_size, + channels_per_group, + num_groups, + SINGLE_CHANNEL_TILE=single_channel_tile, + COMPUTE_PARAM_GRAD=compute_param_grad, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + + # Precision note: + # - In-kernel atomic_add on floating-point values is order-dependent under parallel + # scheduling (non-associative summation), which can introduce run-to-run numerical + # differences in DW/DB for contention-heavy shapes. + # - Host-side dense reduction provides a more stable accumulation pattern for these + # difficult layouts. + if compute_param_grad: + DW = DW_scratch.sum(dim=0).to(W.dtype) + DB = DB_scratch.sum(dim=0).to(W.dtype) + else: + # Fallback path to avoid severe atomic contention when SINGLE_CHANNEL_TILE=False. + # Layout: [B, G, hidden_size] -> [B, G, C_per_G, hidden_per_channel] + X4 = X_grouped.reshape(batch_size, num_groups, channels_per_group, hidden_size_per_channel).to(torch.float32) + dY4 = dY_grouped.reshape(batch_size, num_groups, channels_per_group, hidden_size_per_channel).to(torch.float32) + mean4 = Mean.reshape(batch_size, num_groups, 1, 1).to(torch.float32) + rstd4 = RSTD.reshape(batch_size, num_groups, 1, 1).to(torch.float32) + + x_hat4 = (X4 - mean4) * rstd4 + DW = (dY4 * x_hat4).sum(dim=(0, 3)).reshape(-1).to(W.dtype) + DB = dY4.sum(dim=(0, 3)).reshape(-1).to(W.dtype) + + return DX.view(*shape), DW, DB + + +class LigerGroupNormFunction(torch.autograd.Function): + """ + Group Normalization autograd function for Ascend NPU. + + Forward computes, for each sample/group: + y = (x - mean) * rstd * weight + bias + where: + mean = E[x], rstd = 1 / sqrt(Var[x] + eps) + + The kernel uses row/column tiling with persistent programs. Backward computes + input gradients in Triton and computes parameter gradients either via Triton + atomics (fast path) or host-side dense reduction (fallback path), depending + on the tile/channel layout. 
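+
+    Shape note (as handled by the host wrappers above): X arrives as
+    (batch, num_channels, *), is viewed as (batch, num_groups, hidden) for the
+    kernels, and Mean/RSTD are stored per (batch, group) pair.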
+ """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + X, + affine_scaling_weight, + affine_shifting_bias, + num_channels, + num_groups, + eps, + ): + Y, X, Mean, RSTD = group_norm_forward( + X, + num_channels, + num_groups, + affine_scaling_weight, + affine_shifting_bias, + eps, + ) + ctx.num_channels = num_channels + ctx.num_groups = num_groups + ctx.save_for_backward(X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, dY): + X, W, B, Mean, RSTD = ctx.saved_tensors + DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups) + return DX, DW, DB, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/grpo_loss.py b/src/liger_kernel/ops/backends/_ascend/ops/grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..b430f4c3b37be888bf1a56336e4c645ff1293413 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/grpo_loss.py @@ -0,0 +1,1006 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +# Loss type mapping for Triton constexpr branching +# GRPO/DAPO/BNPO/DR_GRPO share identical per-token loss computation (standard PPO clipping) +_TYPE_GRPO: tl.constexpr = tl.constexpr(0) +_TYPE_CISPO: tl.constexpr = tl.constexpr(1) +_TYPE_SAPO: tl.constexpr = tl.constexpr(2) + +_str_to_loss_type = { + "grpo": _TYPE_GRPO.value, + "dapo": _TYPE_GRPO.value, + "bnpo": _TYPE_GRPO.value, + "dr_grpo": _TYPE_GRPO.value, + "luspo": _TYPE_GRPO.value, + "cispo": _TYPE_CISPO.value, + "sapo": _TYPE_SAPO.value, +} + + +def calculate_tile_count_2d(batch_size, seq_len, num_cores): + """Compute optimal grid configuration for parallel processing.""" + grid_batch = batch_size + cores_per_sample = min(seq_len, num_cores // batch_size) + cores_per_sample = max(1, cores_per_sample) + grid_seq = cores_per_sample + total = grid_batch * grid_seq + if total > num_cores: + grid_seq = max(1, num_cores // grid_batch) + return (grid_batch, grid_seq) + + +def compute_block_size_softmax(seq_vocab_size): + """Determine optimal block size for selective log-softmax kernel.""" + multiplier = 6.0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((seq_vocab_size,),), tiling_dims=(0,) + ) + if tile_shapes and len(tile_shapes) > 0: + return tile_shapes[0][0] + return 2048 + + +def compute_block_size_forward(seq_vocab_size): + """Determine optimal block size for forward pass kernel.""" + multiplier = 10.0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((seq_vocab_size,),), tiling_dims=(0,) + ) + if tile_shapes and len(tile_shapes) > 0: + return tile_shapes[0][0] + return 2048 + + +def compute_block_size_backward(seq_vocab_size): + """Determine optimal block size for backward pass kernel.""" + multiplier = 12.0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((seq_vocab_size,),), tiling_dims=(0,) + ) + if tile_shapes and len(tile_shapes) > 0: + return tile_shapes[0][0] + return 2048 + + +@triton.jit +def _selective_log_softmax_kernel( + LOGITS, + INPUT_IDS, + LOG_P, + MASK, + TEMPERATURE, + stride_input_ids_b, + L: tl.constexpr, + N: tl.constexpr, + 
BLOCK_N: tl.constexpr = 2048, +): + pid_b = tl.program_id(0) + pid_l = tl.program_id(1) + num_progs_l = tl.num_programs(1) + + batch_start = pid_b * L + batch_end = batch_start + L + start_token = batch_start + pid_l + stride = num_progs_l + + for token_idx in tl.range(start_token, batch_end, stride): + off_b = token_idx // L + off_l = token_idx % L + + should_process = 1 + if MASK is not None: + MASK_local = MASK + off_b * stride_input_ids_b + off_l + not_skip = tl.load(MASK_local) + should_process = not_skip + + if should_process != 0: + LOGITS_local = LOGITS + off_b * (L + 1) * N + off_l * N + INPUT_IDS_local = INPUT_IDS + off_b * stride_input_ids_b + off_l + LOG_P_local = LOG_P + token_idx + + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS_local + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + ids = tl.load(INPUT_IDS_local) + x = tl.load(LOGITS_local + ids).to(tl.float32) / TEMPERATURE + logp = x - lse + tl.store(LOG_P_local, logp) + + +@triton.jit +def _grpo_loss_fwd_kernel( + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + COMPLETION_MASK, + ADVANTAGES, + VLLM_IS_RATIO, + VLLM_IS_RATIO_STRIDE, + LOSS, + LSE, + KL, + IS_CLIPPED, + TEMPERATURE, + BETA: tl.constexpr, + EPS_LOW, + EPS_HIGH, + LOSS_TYPE: tl.constexpr, + SAPO_TEMP_POS, + SAPO_TEMP_NEG, + DELTA, + USE_BIAS_CORRECTION_KL: tl.constexpr, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 2048, +): + pid_b = tl.program_id(0) + pid_l = tl.program_id(1) + num_progs_l = tl.num_programs(1) + + batch_start = pid_b * L + batch_end = batch_start + L + start_token = batch_start + pid_l + stride = num_progs_l + + for token_idx in tl.range(start_token, batch_end, stride): + off_b = token_idx // L + off_l = token_idx % L + + should_process = 1 + if COMPLETION_MASK is not None: + COMPLETION_MASK_local = COMPLETION_MASK + off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK_local) + should_process = not_skip + + if should_process != 0: + LOGITS_local = LOGITS + off_b * (L + 1) * N + off_l * N + INPUT_IDS_local = INPUT_IDS + off_b * L + off_l + ADVANTAGES_local = ADVANTAGES + off_b + LOSS_local = LOSS + token_idx + LSE_local = LSE + token_idx + IS_CLIPPED_local = IS_CLIPPED + token_idx + + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS_local + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + idx = tl.load(INPUT_IDS_local) + x = tl.load(LOGITS_local + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + if OLD_LOGP is None: + old_logp = logp + else: + OLD_LOGP_local = OLD_LOGP + token_idx + old_logp = tl.load(OLD_LOGP_local).to(tl.float32) + coef_1 = tl.exp(logp - old_logp) + advantage = tl.load(ADVANTAGES_local).to(tl.float32) + + if LOSS_TYPE == 0: # GRPO/DAPO/BNPO/DR_GRPO + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0) + is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0) + is_clipped = is_low_clipped | is_high_clipped + if DELTA != 0.0: + coef_1 = tl.minimum(coef_1, DELTA) + 
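+                # PPO-style clipped surrogate: with ratio r = exp(logp - old_logp)
+                # and advantage A, the loss is -min(r * A, clip(r, 1 - EPS_LOW, 1 + EPS_HIGH) * A).
+                # For example, with A > 0, r = 1.5, EPS_HIGH = 0.2, the clipped term
+                # 1.2 * A is smaller, so the loss is -1.2 * A and its gradient
+                # w.r.t. r vanishes, which is exactly the high-clipped regime
+                # flagged above.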
per_token_loss1 = coef_1 * advantage + per_token_loss2 = coef_2 * advantage + per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2) + + elif LOSS_TYPE == 1: # CISPO + coef_2 = tl.minimum(coef_1, EPS_HIGH) + per_token_loss = -coef_2 * advantage * logp + is_clipped = (coef_1 > EPS_HIGH) & (advantage > 0) + + elif LOSS_TYPE == 2: # SAPO + temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG) + sigmoid_input = temperature * (coef_1 - 1.0) + sapo_coef = tl.sigmoid(sigmoid_input) * 4.0 / temperature + per_token_loss = -sapo_coef * advantage + is_clipped = 0.0 + + if VLLM_IS_RATIO is not None: + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + per_token_loss = per_token_loss * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP_local = REF_LOGP + token_idx + KL_local = KL + token_idx + ref_logp = tl.load(REF_LOGP_local).to(tl.float32) + kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1 + if USE_BIAS_CORRECTION_KL: + kl = kl * tl.exp(logp - old_logp) + per_token_loss += BETA * kl + tl.store(KL_local, kl) + + tl.store(LOSS_local, per_token_loss) + tl.store(LSE_local, lse) + tl.store(IS_CLIPPED_local, is_clipped) + + +@triton.jit +def _grpo_loss_fwd_kernel_seq( + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + COMPLETION_MASK, + ADVANTAGES, + COEF_1, + COEF_2, + IS_CLIPPED_SEQ, + VLLM_IS_RATIO, + VLLM_IS_RATIO_STRIDE, + LOSS, + LSE, + KL, + IS_CLIPPED, + TEMPERATURE, + BETA: tl.constexpr, + USE_BIAS_CORRECTION_KL: tl.constexpr, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 2048, +): + pid_b = tl.program_id(0) + pid_l = tl.program_id(1) + num_progs_l = tl.num_programs(1) + + batch_start = pid_b * L + batch_end = batch_start + L + start_token = batch_start + pid_l + stride = num_progs_l + + for token_idx in tl.range(start_token, batch_end, stride): + off_b = token_idx // L + off_l = token_idx % L + + should_process = 1 + if COMPLETION_MASK is not None: + COMPLETION_MASK_local = COMPLETION_MASK + off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK_local) + should_process = not_skip + + if should_process != 0: + LOGITS_local = LOGITS + off_b * (L + 1) * N + off_l * N + INPUT_IDS_local = INPUT_IDS + off_b * L + off_l + ADVANTAGES_local = ADVANTAGES + off_b + COEF_1_local = COEF_1 + off_b + COEF_2_local = COEF_2 + off_b + IS_CLIPPED_SEQ_local = IS_CLIPPED_SEQ + off_b + LOSS_local = LOSS + token_idx + LSE_local = LSE + token_idx + IS_CLIPPED_local = IS_CLIPPED + token_idx + + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS_local + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + idx = tl.load(INPUT_IDS_local) + x = tl.load(LOGITS_local + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + + coef_1 = tl.load(COEF_1_local).to(tl.float32) + coef_2 = tl.load(COEF_2_local).to(tl.float32) + is_clipped_seq = tl.load(IS_CLIPPED_SEQ_local) + + advantage = tl.load(ADVANTAGES_local).to(tl.float32) + per_token_loss1 = coef_1 * advantage + per_token_loss2 = coef_2 * advantage + per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2) + + if VLLM_IS_RATIO is not None: + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + per_token_loss = 
per_token_loss * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP_local = REF_LOGP + token_idx + KL_local = KL + token_idx + ref_logp = tl.load(REF_LOGP_local).to(tl.float32) + kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1 + if USE_BIAS_CORRECTION_KL: + if OLD_LOGP is None: + old_logp = logp + else: + old_logp = tl.load(OLD_LOGP + token_idx).to(tl.float32) + kl = kl * tl.exp(logp - old_logp) + per_token_loss += BETA * kl + tl.store(KL_local, kl) + + tl.store(LOSS_local, per_token_loss) + tl.store(LSE_local, lse) + tl.store(IS_CLIPPED_local, is_clipped_seq) + + +@triton.jit +def _grpo_loss_bwd_kernel_seq( + DLOSS, + DLOSS_SUM, + DLOGITS, + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + ADVANTAGES, + COMPLETION_MASK, + LSE, + COEF_1, + SEQ_LEN, + TEMPERATURE, + BETA: tl.constexpr, + USE_BIAS_CORRECTION_KL: tl.constexpr, + EPS_LOW, + EPS_HIGH, + DELTA, + loss_stride0, + loss_stride1, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 2048, +): + pid_b = tl.program_id(0) + pid_l = tl.program_id(1) + num_progs_l = tl.num_programs(1) + + batch_start = pid_b * L + batch_end = batch_start + L + start_token = batch_start + pid_l + stride = num_progs_l + + for token_idx in tl.range(start_token, batch_end, stride): + off_b = token_idx // L + off_l = token_idx % L + + DLOGITS_local = DLOGITS + off_b * (L + 1) * N + off_l * N + + should_process = 1 + if COMPLETION_MASK is not None: + COMPLETION_MASK_local = COMPLETION_MASK + off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK_local) + should_process = not_skip + + if should_process == 0: + for start in range(0, N, BLOCK_N): + cols = tl.arange(0, BLOCK_N) + start + tl.store(DLOGITS_local + cols, 0.0, mask=cols < N) + else: + LOGITS_local = LOGITS + off_b * (L + 1) * N + off_l * N + DLOSS_local = DLOSS + off_b * loss_stride0 + off_l * loss_stride1 + DLOSS_SUM_local = DLOSS_SUM + off_b + INPUT_IDS_local = INPUT_IDS + off_b * L + off_l + ADVANTAGES_local = ADVANTAGES + off_b + LSE_local = LSE + token_idx + COEF_1_local = COEF_1 + off_b + SEQ_LEN_local = SEQ_LEN + off_b + + dloss = tl.load(DLOSS_local).to(tl.float32) + dloss_sum = tl.load(DLOSS_SUM_local).to(tl.float32) + lse = tl.load(LSE_local).to(tl.float32) + coef_1 = tl.load(COEF_1_local).to(tl.float32) + seq_len = tl.load(SEQ_LEN_local).to(tl.float32) + + idx = tl.load(INPUT_IDS_local) + x = tl.load(LOGITS_local + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + + advantage = tl.load(ADVANTAGES_local).to(tl.float32) + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + if DELTA != 0.0: + coef_1_for_loss = tl.minimum(coef_1, DELTA) + else: + coef_1_for_loss = coef_1 + per_token_loss1 = coef_1_for_loss * advantage + per_token_loss2 = coef_2 * advantage + is_unclipped = per_token_loss2 >= per_token_loss1 + + dlogp = -coef_1 * advantage / seq_len * is_unclipped * dloss_sum + if DELTA != 0.0: + dlogp = dlogp * (coef_1 <= DELTA) + + if BETA != 0.0: + REF_LOGP_local = REF_LOGP + token_idx + ref_logp = tl.load(REF_LOGP_local).to(tl.float32) + if USE_BIAS_CORRECTION_KL: + if OLD_LOGP is None: + old_logp = logp + else: + old_logp = tl.load(OLD_LOGP + token_idx).to(tl.float32) + token_coef_1 = tl.exp(logp - old_logp) + dlogp += BETA * token_coef_1 * (logp - ref_logp) * dloss + else: + dlogp += BETA * (1 - tl.exp(ref_logp - logp)) * dloss + + dlogp = dlogp / TEMPERATURE + for start_n in tl.range(0, N, BLOCK_N): + cols = start_n + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS_local + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE + probs = tl.exp(logits - lse) + 
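+                # Log-softmax backward: d logp(idx) / d logit_j = 1[j == idx] - softmax_j,
+                # so each column receives (indicator - prob) scaled by the upstream dlogp.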
dlogits = tl.where(cols == idx, 1 - probs, -probs) * dlogp + tl.store(DLOGITS_local + cols, dlogits, mask=cols < N) + + +@triton.jit +def _grpo_loss_bwd_kernel( + DLOSS, + DLOGITS, + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + ADVANTAGES, + COMPLETION_MASK, + LSE, + VLLM_IS_RATIO, + VLLM_IS_RATIO_STRIDE, + TEMPERATURE, + BETA: tl.constexpr, + EPS_LOW, + EPS_HIGH, + LOSS_TYPE: tl.constexpr, + SAPO_TEMP_POS, + SAPO_TEMP_NEG, + DELTA, + USE_BIAS_CORRECTION_KL: tl.constexpr, + loss_stride0, + loss_stride1, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 2048, +): + pid_b = tl.program_id(0) + pid_l = tl.program_id(1) + num_progs_l = tl.num_programs(1) + + batch_start = pid_b * L + batch_end = batch_start + L + start_token = batch_start + pid_l + stride = num_progs_l + + for token_idx in tl.range(start_token, batch_end, stride): + off_b = token_idx // L + off_l = token_idx % L + + DLOGITS_local = DLOGITS + off_b * (L + 1) * N + off_l * N + + should_process = 1 + if COMPLETION_MASK is not None: + COMPLETION_MASK_local = COMPLETION_MASK + off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK_local) + should_process = not_skip + + if should_process == 0: + for start in range(0, N, BLOCK_N): + cols = tl.arange(0, BLOCK_N) + start + tl.store(DLOGITS_local + cols, 0.0, mask=cols < N) + else: + LOGITS_local = LOGITS + off_b * (L + 1) * N + off_l * N + DLOSS_local = DLOSS + off_b * loss_stride0 + off_l * loss_stride1 + INPUT_IDS_local = INPUT_IDS + off_b * L + off_l + ADVANTAGES_local = ADVANTAGES + off_b + LSE_local = LSE + token_idx + + dloss = tl.load(DLOSS_local).to(tl.float32) + lse = tl.load(LSE_local).to(tl.float32) + + idx = tl.load(INPUT_IDS_local) + x = tl.load(LOGITS_local + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + if OLD_LOGP is None: + old_logp = logp + else: + OLD_LOGP_local = OLD_LOGP + token_idx + old_logp = tl.load(OLD_LOGP_local).to(tl.float32) + coef_1 = tl.exp(logp - old_logp) + advantage = tl.load(ADVANTAGES_local).to(tl.float32) + + if LOSS_TYPE == 0: # GRPO/DAPO/BNPO/DR_GRPO + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + if DELTA != 0.0: + coef_1_for_loss = tl.minimum(coef_1, DELTA) + else: + coef_1_for_loss = coef_1 + per_token_loss1 = coef_1_for_loss * advantage + per_token_loss2 = coef_2 * advantage + mask = per_token_loss2 >= per_token_loss1 + dlogp = -coef_1 * advantage * mask + if DELTA != 0.0: + dlogp = dlogp * (coef_1 <= DELTA) + + elif LOSS_TYPE == 1: # CISPO + coef_2 = tl.minimum(coef_1, EPS_HIGH) + dlogp = -coef_2 * advantage + + elif LOSS_TYPE == 2: # SAPO + temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG) + sigmoid_input = temperature * (coef_1 - 1.0) + sigmoid_val = tl.sigmoid(sigmoid_input) + d_sapo_d_coef1 = 4.0 * sigmoid_val * (1.0 - sigmoid_val) + dlogp = -advantage * d_sapo_d_coef1 * coef_1 + + if VLLM_IS_RATIO is not None: + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + dlogp = dlogp * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP_local = REF_LOGP + token_idx + ref_logp = tl.load(REF_LOGP_local).to(tl.float32) + if USE_BIAS_CORRECTION_KL: + dlogp += BETA * coef_1 * (logp - ref_logp) + else: + dlogp += BETA * (1 - tl.exp(ref_logp - logp)) + + dlogp = dlogp * dloss / TEMPERATURE + tl.debug_barrier() + for start_n in tl.range(0, N, BLOCK_N): + cols = start_n + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS_local + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE + probs = tl.exp(logits - lse) + dlogits = 
tl.where(cols == idx, 1 - probs, -probs) * dlogp + tl.store(DLOGITS_local + cols, dlogits, mask=cols < N) + + +@torch.no_grad +def fused_selective_log_softmax(logits: torch.Tensor, input_ids: torch.Tensor, temperature: float = 0.9, mask=None): + """Compute log probabilities for specific token IDs with selective masking.""" + assert logits.is_contiguous() + B, L_ADD_1, N = logits.shape + L = L_ADD_1 - 1 + input_ids = input_ids[:, -L:] + if mask is not None: + mask = mask[:, -L:] + log_p = torch.zeros(B, L, dtype=torch.float32, device=logits.device) + + block_n = compute_block_size_softmax(N) + num_cores = get_npu_core_count() + grid = calculate_tile_count_2d(B, L, num_cores) + _selective_log_softmax_kernel[grid]( + logits, + input_ids, + log_p, + mask, + temperature, + input_ids.stride(0), + L, + N, + BLOCK_N=block_n, + ) + return log_p + + +def compute_distribution_normalizer(completion_mask): + """Calculate global active token count for distributed loss normalization.""" + normalizer = completion_mask.to(torch.float32).sum() + world_size = 1 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + normalizer = normalizer.clone() + torch.distributed.all_reduce(normalizer, op=torch.distributed.ReduceOp.SUM) + world_size = torch.distributed.get_world_size() + normalizer = normalizer / world_size + return torch.clamp(normalizer, min=1.0) + + +def reduce_loss(per_token_loss, mask, loss_type, max_completion_length, batch_size, seq_len): + """Apply reduction strategy based on specified loss type.""" + if loss_type == "grpo" or loss_type == "sapo": + return ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + elif loss_type == "bnpo": + return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + max_len = max_completion_length if max_completion_length is not None else seq_len + return (per_token_loss * mask).sum() / (batch_size * max_len) + elif loss_type == "dapo" or loss_type == "cispo": + return (per_token_loss * mask).sum() / compute_distribution_normalizer(mask) + elif loss_type == "luspo": + return (per_token_loss * mask.sum(-1, keepdim=True)).mean() + raise ValueError(f"Unknown loss_type: {loss_type}. Expected one of: grpo, bnpo, dr_grpo, dapo, cispo, sapo, luspo") + + +def grpo_loss_forward_triton( + ctx, + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type="grpo", + max_completion_length=None, + reduce=True, + importance_sampling_level="token", + sapo_temperature_pos=1.0, + sapo_temperature_neg=1.05, + vllm_is_ratio=None, + delta=None, + use_bias_correction_kl=False, +): + """Forward pass computation for GRPO loss.""" + assert logits.is_contiguous() and completion_ids.is_contiguous() + assert old_logp is None or old_logp.is_contiguous() + assert (ref_logp is not None and ref_logp.is_contiguous()) if beta != 0.0 else True + assert importance_sampling_level in ("token", "sequence"), ( + f"importance_sampling_level must be 'token' or 'sequence', got {importance_sampling_level}" + ) + + if loss_type not in _str_to_loss_type: + raise ValueError(f"Unknown loss_type '{loss_type}'. 
Supported types: {list(_str_to_loss_type.keys())}") + + if delta is not None and loss_type in ("cispo", "sapo"): + raise ValueError(f"delta (two-sided clipping) is not supported for loss_type='{loss_type}'.") + + delta_val = 0.0 if delta is None else float(delta) + + if importance_sampling_level == "sequence" and loss_type in ("cispo", "sapo"): + raise ValueError( + f"Sequence-level importance sampling is not supported for loss_type='{loss_type}'. " + f"Use importance_sampling_level='token' instead." + ) + + if loss_type == "sapo": + if sapo_temperature_pos <= 0: + raise ValueError(f"sapo_temperature_pos must be positive, got {sapo_temperature_pos}") + if sapo_temperature_neg <= 0: + raise ValueError(f"sapo_temperature_neg must be positive, got {sapo_temperature_neg}") + + loss_type_int = _str_to_loss_type[loss_type] + + B, L_ADD_1, N = logits.shape + L = L_ADD_1 - 1 + + if completion_mask is not None: + assert completion_mask.is_contiguous() + + mask = completion_mask.float() if completion_mask is not None else torch.ones(B, L, device=logits.device) + + vllm_is_ratio_ptr = None + vllm_is_ratio_stride = L + if vllm_is_ratio is not None: + assert vllm_is_ratio.dim() in (1, 2), ( + f"vllm_is_ratio must be 1D (B,) or 2D (B, L) / (B, 1), got {vllm_is_ratio.dim()}D" + ) + if vllm_is_ratio.dim() == 2: + assert vllm_is_ratio.shape[0] == B and vllm_is_ratio.shape[1] in (1, L), ( + f"vllm_is_ratio shape must be ({B}, 1) or ({B}, {L}), got {tuple(vllm_is_ratio.shape)}" + ) + else: + assert vllm_is_ratio.shape[0] == B, f"vllm_is_ratio shape must be ({B},), got {tuple(vllm_is_ratio.shape)}" + vllm_is_ratio = vllm_is_ratio.contiguous() + vllm_is_ratio_ptr = vllm_is_ratio + vllm_is_ratio_stride = vllm_is_ratio.shape[1] if vllm_is_ratio.dim() > 1 else 1 + + loss = torch.zeros(B, L, device=logits.device, dtype=torch.float32) + lse = torch.zeros_like(loss) + is_clipped = torch.zeros_like(loss) + kl = torch.zeros_like(loss) if beta != 0.0 else None + + block_n = compute_block_size_forward(N) + num_cores = get_npu_core_count() + grid = calculate_tile_count_2d(B, L, num_cores) + + if importance_sampling_level == "sequence": + per_token_logps = fused_selective_log_softmax(logits, completion_ids, temperature, completion_mask) + + if old_logp is None: + log_ratio = torch.zeros_like(per_token_logps) + else: + log_ratio = per_token_logps - old_logp + + seq_lens = mask.sum(-1).clamp(min=1.0) + seq_log_importance = (log_ratio * mask).sum(-1) / seq_lens + coef_1 = torch.exp(seq_log_importance) + coef_2 = torch.clamp(coef_1, 1 - eps_low, 1 + eps_high) + + is_clipped_seq = ((coef_1 < 1 - eps_low) & (advantages < 0)) | ((coef_1 > 1 + eps_high) & (advantages > 0)) + is_clipped_seq = is_clipped_seq.float() + + if delta is not None: + coef_1_for_loss = torch.clamp(coef_1, max=delta) + else: + coef_1_for_loss = coef_1 + + _grpo_loss_fwd_kernel_seq[grid]( + logits, + old_logp, + ref_logp, + completion_ids, + completion_mask, + advantages, + coef_1_for_loss.contiguous(), + coef_2.contiguous(), + is_clipped_seq.contiguous(), + vllm_is_ratio_ptr, + vllm_is_ratio_stride, + loss, + lse, + kl, + is_clipped, + temperature, + beta, + use_bias_correction_kl, + L, + N, + BLOCK_N=block_n, + ) + + ctx.save_for_backward( + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + mask, + coef_1, + seq_lens, + vllm_is_ratio_ptr, + ) + else: + _grpo_loss_fwd_kernel[grid]( + logits, + old_logp, + ref_logp, + completion_ids, + completion_mask, + advantages, + vllm_is_ratio_ptr, + vllm_is_ratio_stride, + 
loss, + lse, + kl, + is_clipped, + temperature, + beta, + eps_low, + eps_high, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + delta_val, + use_bias_correction_kl, + L, + N, + BLOCK_N=block_n, + ) + ctx.save_for_backward( + logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse, mask, vllm_is_ratio_ptr + ) + + ctx.infos = ( + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + max_completion_length, + B, + L, + importance_sampling_level, + vllm_is_ratio_stride, + reduce, + delta_val, + use_bias_correction_kl, + ) + + mask_sum = mask.sum().clamp(min=1.0) + kl_mean = (kl * mask).sum() / mask_sum if kl is not None else None + clip_ratio = (is_clipped.float() * mask).sum() / mask_sum + + if not reduce: + loss_out = loss * mask + kl_out = kl * mask if kl is not None else None + is_clipped_out = is_clipped * mask + return loss_out, kl_out, is_clipped_out + + reduced_loss = reduce_loss(loss, mask, loss_type, max_completion_length, B, L) + return reduced_loss, kl_mean, clip_ratio + + +def grpo_loss_backward_triton(ctx, *args): + """Backward pass computation for GRPO loss.""" + dloss_input = args[0] + saved_tensors = ctx.saved_tensors + ( + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + max_completion_length, + B, + L, + importance_sampling_level, + vllm_is_ratio_stride, + reduce, + delta_val, + use_bias_correction_kl, + ) = ctx.infos + + if importance_sampling_level == "sequence": + ( + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + mask, + coef_1, + seq_lens, + vllm_is_ratio, + ) = saved_tensors + else: + (logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse, mask, vllm_is_ratio) = ( + saved_tensors + ) + + _, L_ADD_1, N = logits.shape + + if not reduce: + dloss = dloss_input + elif loss_type == "grpo" or loss_type == "sapo": + seq_lens_bwd = mask.sum(-1, keepdim=True).clamp(min=1.0) + dloss = dloss_input * mask / (seq_lens_bwd * B) + elif loss_type == "bnpo": + dloss = dloss_input * mask / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + max_len = max_completion_length if max_completion_length is not None else L + dloss = dloss_input * mask / (B * max_len) + elif loss_type == "dapo" or loss_type == "cispo": + dloss = dloss_input * mask / compute_distribution_normalizer(mask) + elif loss_type == "luspo": + seq_lens_bwd = mask.sum(-1, keepdim=True).clamp(min=1.0) + dloss = dloss_input * seq_lens_bwd / (B * L) + else: + raise ValueError(f"Unknown loss_type: {loss_type}") + + dlogits = logits.data if inplace else torch.empty_like(logits) + + block_n = compute_block_size_backward(N) + num_cores = get_npu_core_count() + grid = calculate_tile_count_2d(B, L, num_cores) + + if importance_sampling_level == "sequence": + if vllm_is_ratio is None: + dloss_sum = dloss.sum(-1).contiguous() + else: + if vllm_is_ratio.dim() == 1: + ratio = vllm_is_ratio.unsqueeze(-1) + else: + ratio = vllm_is_ratio + dloss_sum = (dloss * ratio).sum(-1).contiguous() + _grpo_loss_bwd_kernel_seq[grid]( + dloss, + dloss_sum, + dlogits, + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + coef_1, + seq_lens, + temperature, + beta, + use_bias_correction_kl, + eps_low, + eps_high, + delta_val, + *dloss.stride(), + L, + N, + BLOCK_N=block_n, + ) + else: + _grpo_loss_bwd_kernel[grid]( + dloss, + dlogits, + 
logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + vllm_is_ratio, + vllm_is_ratio_stride, + temperature, + beta, + eps_low, + eps_high, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + delta_val, + use_bias_correction_kl, + *dloss.stride(), + L, + N, + BLOCK_N=block_n, + ) + + dlogits[:, -1, :] = 0 + return ( + dlogits, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) + + +class GrpoLossFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, *args): + return grpo_loss_forward_triton(ctx, *args) + + @staticmethod + @ensure_contiguous + def backward(ctx, *args): + return grpo_loss_backward_triton(ctx, *args) diff --git a/src/liger_kernel/ops/backends/_ascend/ops/jsd.py b/src/liger_kernel/ops/backends/_ascend/ops/jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..a28eecda1e71806b6010927b4453dbfb50e067dc --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/jsd.py @@ -0,0 +1,229 @@ +from typing import Optional + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _jsd_kernel( + X_ptr, # input in logspace, X = log Q + X_stride, + Y_ptr, # ground truth in logspace, Y = log P + Y_stride, + loss_ptr, + loss_stride, + dX_ptr, + dX_stride, + label_ptr, + beta: tl.constexpr, + n_non_ignore: int, + ignore_index: tl.constexpr, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + HAS_LABEL: tl.constexpr, +): + # JSD(P || Q) = (KL(P || M) + KL(Q || M)) / 2, M = (1/2) * (P + Q) = (1/2) * (e ^ Y + e ^ X) + # = sum(P * log P + Q * log Q - 2 * M * log M) / 2 + # = sum(e ^ Y * Y + e ^ X * X - 2 * M * log M) / 2 + # grad_x_i = 0.5 * Q * (X - log_M) + + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-Stride Loop - each kernel processes multiple rows + for row_idx in range(pid, n_rows, num_progs): + X_row_ptr = X_ptr + row_idx * X_stride + Y_row_ptr = Y_ptr + row_idx * Y_stride + loss_row_ptr = loss_ptr + row_idx * loss_stride + dX_row_ptr = dX_ptr + row_idx * dX_stride + + should_skip = False + if HAS_LABEL: + label = tl.load(label_ptr + row_idx) + should_skip = label == ignore_index + + if should_skip: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + tl.store(dX_row_ptr + offsets, 0.0, mask=mask) + tl.store(loss_row_ptr + offsets, 0.0, mask=mask) + else: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + X = tl.load(X_row_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32) + Y = tl.load(Y_row_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32) + + if beta == 0.0: # forward KL + Y_max = tl.max(Y, axis=0) + Y_shifted = Y - Y_max + Y_prob = tl.exp(Y_shifted) * tl.exp(Y_max) # Compensate for the shift + loss = Y_prob * (Y - X) + dX = -Y_prob + elif beta == 1.0: # reverse KL + X_max = tl.max(X, axis=0) + X_shifted = X - X_max + X_prob = tl.exp(X_shifted) * tl.exp(X_max) # Compensate for the shift + loss = X_prob * (X - Y) + dX = loss + X_prob + else: + max_val = tl.maximum(tl.max(X, axis=0), tl.max(Y, axis=0)) + X_shifted = X - max_val + Y_shifted = Y - max_val + + # 
Pre-compute exp(max_val) since it's used twice + exp_max = tl.exp(max_val) + + # Compute exp terms with compensation + Q = tl.exp(X_shifted) * exp_max # = exp(X) + P = tl.exp(Y_shifted) * exp_max # = exp(Y) + + # Pre-compute common terms + beta_P = beta * P + one_minus_beta_Q = (1 - beta) * Q + M = beta_P + one_minus_beta_Q + log_M = tl.log(M) + + loss = beta_P * Y + one_minus_beta_Q * X - M * log_M + dX = one_minus_beta_Q * (X - log_M) + + # Pre-compute scaling factor + scale = 1.0 / n_non_ignore + loss = loss * scale + dX = dX * scale + + tl.store(loss_row_ptr + offsets, loss, mask=mask) + tl.store(dX_row_ptr + offsets, dX, mask=mask) + + +def get_optimal_block_size(total_elements): + """ + Calculate optimal Block Size using compute_default_tiling_strategy + """ + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=8.0, shapes=((total_elements,),), tiling_dims=(0,) + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return block_size + else: + return 2048 + + +def jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label): + BT, V = _input.shape + n_rows = BT + BLOCK_SIZE = get_optimal_block_size(V) + + # non reduction loss + loss = torch.zeros(_input.shape, dtype=torch.float32, device=_input.device) + dX = torch.empty_like(_input) + + if has_label: + n_non_ignore = (shift_labels != ignore_index).sum().item() + else: + n_non_ignore = BT + + # Use NPU core count for grid size + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_rows) + + _jsd_kernel[(grid_size,)]( + X_ptr=_input, + X_stride=_input.stride(-2), + Y_ptr=target, + Y_stride=target.stride(-2), + loss_ptr=loss, + loss_stride=loss.stride(-2), + dX_ptr=dX, + dX_stride=dX.stride(-2), + label_ptr=(shift_labels if has_label else torch.empty(1, device=_input.device)), + beta=beta, + n_non_ignore=n_non_ignore, + ignore_index=ignore_index, + n_rows=n_rows, + n_cols=V, + BLOCK_SIZE=BLOCK_SIZE, + HAS_LABEL=has_label, + ) + + loss = torch.sum(loss) + return loss.to(_input.dtype), dX + + +def jsd_backward(dX, grad_output): + # If jsd is the last layer, grad_output is 1.0. Skip the mul to save time + if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)): + return dX + else: + return grad_output * dX + + +class LigerJSDFunction(torch.autograd.Function): + r""" + This class implements the forward and backward pass for the generalized Jensen-Shannon Divergence. + .. math:: + JSD(\beta)(P || Q) + = \beta * KLDiv(P || (\beta * P + (1 - \beta) * Q)) + (1 - \beta) * KLDiv(Q || (\beta * P + (1 - \beta) * Q)) + + .. note:: + As all the other losses in PyTorch, this function expects the first argument, + :attr:`_input`, to be the predictions, the output of the student model, in log-space + and the second, :attr:`target`, to be the observations, the output of the teacher model, in log-space. + This differs from the standard mathematical notation :math:`JSD(P || Q)` where + :math:`P` denotes the teacher model and :math:`Q` denotes the student model. 
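+
+    The returned loss is summed over all elements and pre-scaled by
+    1 / n_non_ignore inside the kernel, where n_non_ignore counts the rows whose
+    label differs from :attr:`ignore_index` (all rows when no labels are given).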
+ """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + _input: torch.Tensor, + target: torch.Tensor, + shift_labels: Optional[torch.Tensor] = None, + beta: float = 0.5, + ignore_index: int = -100, + ) -> torch.Tensor: + """ + Args: + _input (torch.Tensor): predict values with shape (BT, V) in logspace + target (torch.Tensor): ground truth values with shape (BT, V) in logspace + shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1]. + beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5` + ignore_index (int): the index to ignore. Default: -100 + + Returns: + loss (torch.Tensor): generalized JSD + """ + has_label = False + if shift_labels is not None: + assert shift_labels.shape == (_input.shape[0],), ( + f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, dX = jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label) + ctx.save_for_backward(dX) + return loss + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + (dX,) = ctx.saved_tensors + dX = jsd_backward(dX, grad_output) + return ( + dX, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/backends/_ascend/ops/kl_div.py b/src/liger_kernel/ops/backends/_ascend/ops/kl_div.py new file mode 100755 index 0000000000000000000000000000000000000000..f7b2614f692f712cf3f0ee320dda77adf832ff86 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/kl_div.py @@ -0,0 +1,327 @@ +from typing import Literal + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"] + +_REDUCTION_MODE_NONE: tl.constexpr = tl.constexpr(0) +_REDUCTION_MODE_SUM: tl.constexpr = tl.constexpr(1) +_REDUCTION_MODE_MEAN: tl.constexpr = tl.constexpr(2) +_REDUCTION_MODE_BATCHMEAN: tl.constexpr = tl.constexpr(3) + +_str_to_reduction_mode = { + "none": _REDUCTION_MODE_NONE.value, + "sum": _REDUCTION_MODE_SUM.value, + "mean": _REDUCTION_MODE_MEAN.value, + "batchmean": _REDUCTION_MODE_BATCHMEAN.value, +} + +# ----------------------------------------------------------------------------- +# Kernels (2D Tiling + Persistent Programs) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _kldiv_kernel_forward( + y_ptr, # [B, S], prediction ptr, the kernel expects the prediction in log-space + gt_ptr, # [B, S], ground truth ptr + loss_ptr, # [B] or [B, S] if reduction == _REDUCTION_MODE_NONE, output ptr + n_rows, # int, number of rows in the input tensor + n_cols, # int, number of columns in the input tensor + eps, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + log_target: tl.constexpr = False, + reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN, +): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + grid_m = tl.cdiv(n_rows, BLOCK_SIZE_M) + grid_n = tl.cdiv(n_cols, BLOCK_SIZE_N) + total_2d_blocks = grid_m * grid_n + + # Persistent-program loop over logical 2D blocks. 
+    for block_idx in tl.range(pid, total_2d_blocks, num_progs):
+        block_m = block_idx // grid_n
+        block_n = block_idx % grid_n
+
+        offset_m = tl.arange(0, BLOCK_SIZE_M) + block_m * BLOCK_SIZE_M
+        offset_n = tl.arange(0, BLOCK_SIZE_N) + block_n * BLOCK_SIZE_N
+
+        mask_m = offset_m < n_rows
+        mask_n = offset_n < n_cols
+
+        offset = offset_m[:, None] * n_cols + offset_n[None, :]
+        mask = mask_m[:, None] & mask_n[None, :]
+
+        y = tl.load(y_ptr + offset, mask=mask, other=0.0)
+        y_true = tl.load(gt_ptr + offset, mask=mask, other=0.0)
+
+        # KL(y_true || y_pred) with y_pred provided in log-space.
+        # - log_target=False: y_true is probability space; clamp with eps before log.
+        # - log_target=True : y_true is log-probability space.
+        if log_target:
+            loss = tl.exp(y_true) * (y_true - y)
+        else:
+            loss = y_true * (tl.log(tl.maximum(y_true, eps)) - y)
+
+        if reduction == _REDUCTION_MODE_NONE:
+            tl.store(loss_ptr + offset, loss, mask=mask)
+        else:
+            # Multiple block_n tiles may update the same row, so atomic_add is required.
+            loss_sum = tl.sum(loss, axis=1)
+            tl.atomic_add(loss_ptr + offset_m, loss_sum, mask=mask_m)
+
+
+@triton.jit
+def _kldiv_kernel_backward(
+    target_ptr,
+    new_grads_ptr,
+    grad_output_ptr,
+    n_rows,
+    n_cols,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    log_target: tl.constexpr = False,
+    reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN,
+):
+    pid = tl.program_id(0)
+    num_progs = tl.num_programs(0)
+
+    grid_m = tl.cdiv(n_rows, BLOCK_SIZE_M)
+    grid_n = tl.cdiv(n_cols, BLOCK_SIZE_N)
+    total_2d_blocks = grid_m * grid_n
+
+    # For reduced losses, grad_output is a scalar. Load it once per program.
+    if reduction != _REDUCTION_MODE_NONE:
+        grad_output_scalar = tl.load(grad_output_ptr)
+
+    # Persistent-program loop over logical 2D blocks.
+    for block_idx in tl.range(pid, total_2d_blocks, num_progs):
+        block_m = block_idx // grid_n
+        block_n = block_idx % grid_n
+
+        offset_m = tl.arange(0, BLOCK_SIZE_M) + block_m * BLOCK_SIZE_M
+        offset_n = tl.arange(0, BLOCK_SIZE_N) + block_n * BLOCK_SIZE_N
+
+        mask_m = offset_m < n_rows
+        mask_n = offset_n < n_cols
+
+        offset = offset_m[:, None] * n_cols + offset_n[None, :]
+        mask = mask_m[:, None] & mask_n[None, :]
+
+        y_true = tl.load(target_ptr + offset, mask=mask, other=0.0)
+
+        if log_target:
+            res = -tl.exp(y_true)
+        else:
+            res = -y_true
+
+        if reduction != _REDUCTION_MODE_NONE:
+            res = res * grad_output_scalar
+        else:
+            grad_output = tl.load(grad_output_ptr + offset, mask=mask, other=0.0)
+            res = res * grad_output
+
+        if reduction == _REDUCTION_MODE_BATCHMEAN:
+            res = res / n_rows
+        elif reduction == _REDUCTION_MODE_MEAN:
+            res = res / (n_rows * n_cols)
+
+        tl.store(new_grads_ptr + offset, res, mask=mask)
+
+
+# -----------------------------------------------------------------------------
+# Helper: Call compute_default_tiling_strategy
+# -----------------------------------------------------------------------------
+
+
+def get_optimal_block_size(
+    n_rows,
+    dtype_size,
+    BLOCK_SIZE_N: int,
+    log_target: bool = False,
+    is_backward: bool = False,
+    is_scalar_grad_output: bool = True,
+):
+    """
+    Calculate optimal BLOCK_SIZE_M using compute_default_tiling_strategy.
+    """
+    # 1) Set memory multiplier
+    # Backward is lighter than forward in this op, so we typically use a smaller multiplier.
+    # If backward also needs to stream a full grad_output tile (i.e., grad_output is not a scalar),
+    # its memory footprint becomes closer to forward, so we bump the multiplier.
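+    # (Rough accounting, not an exact count: every tile-sized buffer that is
+    # live at once in the kernel, i.e. inputs, a streamed grad_output, and
+    # intermediates, adds one multiple of BLOCK_SIZE_M * BLOCK_SIZE_N * dtype_size
+    # of UB pressure, which is what memory_multiplier approximates.)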
+ if is_backward: + multiplier = 2.5 if is_scalar_grad_output else 3.0 + else: + multiplier = 3.0 if log_target else 6.0 + + # For bf16/fp16 (dtype_size < 4), compile-time UB overflow was observed on some shapes. + # Clamp to fp32 size for a conservative tiling estimate; this can be refined later. + dtype_size = max(dtype_size, 4) + + # 2) Call tiling strategy (tile only dim 0 / rows) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=dtype_size, + memory_multiplier=multiplier, + shapes=((n_rows, BLOCK_SIZE_N),), + tiling_dims=(0,), + ) + + # 3) Parse result + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return block_size + else: + return triton.next_power_of_2(min(128, n_rows)) + + +def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps): # [BT, V] + BT, V = y_pred.shape + reduction = _str_to_reduction_mode[reduction] + + out_size = (BT, V) if reduction == _REDUCTION_MODE_NONE.value else (BT,) + output_tensor = torch.zeros(out_size, device=y_pred.device, dtype=torch.float32) + + BLOCK_SIZE_N = triton.next_power_of_2(min(128, V)) + BLOCK_SIZE_M = get_optimal_block_size(BT, y_pred.element_size(), BLOCK_SIZE_N, log_target=log_target) + num_cores = get_npu_core_count() + total_blocks = triton.cdiv(BT, BLOCK_SIZE_M) * triton.cdiv(V, BLOCK_SIZE_N) + grid = min(num_cores, total_blocks) + + _kldiv_kernel_forward[(grid,)]( + y_pred, + y_true, + output_tensor, + BT, + V, + eps=eps, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + log_target=log_target, + reduction=reduction, + ) + + # Final reduction follows PyTorch KLDivLoss semantics. + # Note: In newer PyTorch versions, `mean` is planned to match `batchmean`. + # See: https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html + if reduction == _REDUCTION_MODE_BATCHMEAN.value: + return output_tensor.sum() / BT + elif reduction == _REDUCTION_MODE_SUM.value: + return output_tensor.sum(dim=0) + elif reduction == _REDUCTION_MODE_MEAN.value: + return output_tensor.sum() / (BT * V) + else: + return output_tensor + + +def kldiv_backward_triton(target, grad_output, new_grads, log_target, reduction): + BT, V = target.shape + reduction = _str_to_reduction_mode[reduction] + + BLOCK_SIZE_N = triton.next_power_of_2(min(128, V)) + # grad_output handling: + # - numel() == 1: use scalar grad_output path in kernel. + # - numel() != 1: stream per-element grad_output tile in kernel. + is_scalar_grad_output = grad_output.numel() == 1 + BLOCK_SIZE_M = get_optimal_block_size( + BT, + target.element_size(), + BLOCK_SIZE_N, + log_target=log_target, + is_backward=True, + is_scalar_grad_output=is_scalar_grad_output, + ) + num_cores = get_npu_core_count() + total_blocks = triton.cdiv(BT, BLOCK_SIZE_M) * triton.cdiv(V, BLOCK_SIZE_N) + grid = min(num_cores, total_blocks) + + _kldiv_kernel_backward[(grid,)]( + target, + new_grads, + grad_output, + BT, + V, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + log_target=log_target, + reduction=reduction, + ) + + return new_grads + + +class LigerKLDivLossFunction(torch.autograd.Function): + """ + Class implementing the forward and backward pass for the KL Divergence Loss using Triton, as defined by the following formula: + ```python + if log_target: + loss = target.exp() * (target - input) + else: + loss = target * (target.log() - input) + ```, + then the loss is reduced according to the `reduction` parameter. 
+ as defined in the PyTorch documentation: https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html + """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + y_pred: torch.Tensor, + y_true: torch.Tensor, + reduction: REDUCTION_LITERAL = "batchmean", + log_target: bool = False, + eps: float = 1e-10, + ) -> torch.Tensor: + """A forward pass for the KL Divergence Loss. + + Args: + ctx: Torch autograd context + y_pred (torch.Tensor): A tensor of shape (BT, V) containing the predicted values, expected to be log-probabilities. + y_true (torch.Tensor): A tensor of shape (BT, V) containing the target values, expected to be either probabilities or log-probabilities, depending on the value of `log_target`. + reduction (REDUCTION_LITERAL, optional): Reduction to be used. Defaults to "batchmean". + log_target (bool, optional): If set to true, expects the ground truth to already be log-probabilities. Defaults to False. + eps: (float, optional): A small value to avoid division by zero. Defaults to 1e-10. + + Returns: + torch.Tensor: The computed KL Divergence Loss, with shape (BT, V) if `reduction` is "none", else a scalar. + """ + ctx.save_for_backward(y_true) + ctx.reduction = reduction + ctx.log_target = log_target + return kldiv_forward_triton(y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps) + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + """A backward pass for the KL Divergence Loss. + + Args: + ctx: Torch autograd context + grad_output (torch.Tensor): The gradient of the loss with respect to the output. + + Returns: + tuple[torch.Tensor, None, None, None, None]: The gradient of the loss with respect to the inputs and None for the other arguments of the forward method. + """ + (y_true,) = ctx.saved_tensors + + new_grads = torch.empty_like(y_true) + + derivative = kldiv_backward_triton(y_true, grad_output, new_grads, ctx.log_target, ctx.reduction) + + return ( + derivative, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/backends/_ascend/ops/layer_norm.py b/src/liger_kernel/ops/backends/_ascend/ops/layer_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..6e82026d49d02b958627a21d9d2cc93a0df6bf26 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/layer_norm.py @@ -0,0 +1,642 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import rsqrt + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +# ----------------------------------------------------------------------------- +# Optimized Forward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _layer_norm_forward_kernel_no_tiling( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, + B_ptr, + Mean_ptr, + Mean_row_stride, + RSTD_ptr, + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps: tl.constexpr, + n_cols_inv: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + OPTIMIZED NPU layer_norm forward kernel for small n_cols (<= 2048). + + Key optimizations: + 1. Pre-compute n_cols_inv to avoid repeated scalar division + 2. Hoist W and B loads outside the loop (already done) + 3. Minimize per-iteration scalar operations + 4. 
Use vectorized operations for mask handling + 5. Optimize cache hints for memory access patterns + 6. Reduce type conversions by keeping intermediate results in float32 + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Pre-compute grid stride constants (done once, not per iteration) + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + # Load W and B once (already optimized - kept outside loop) + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + B_row = tl.load(B_ptr + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + base_row_idx = pid * BLOCK_SIZE_M + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + base_row_idx + row_offsets + row_mask = row_idx < n_rows + + block_mask = row_mask[:, None] & col_mask[None, :] + + X_block_ptr = X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :] + + X_rows = tl.load( + X_block_ptr, + mask=block_mask, + other=0.0, + cache_modifier=".cg", + ).to(tl.float32) + + # Compute mean with vectorized operations + row_sum = tl.sum(X_rows, axis=1) + mean_rows = row_sum * n_cols_inv # Multiplication is faster than division + + # Center the data (vectorized operation) + X_centered = X_rows - mean_rows[:, None] + + X_centered_masked = tl.where(block_mask, X_centered, 0.0) + var_rows = tl.sum(X_centered_masked * X_centered_masked, axis=1) * n_cols_inv + + rstd_rows = rsqrt(var_rows + eps) + + Mean_ptr_offset = Mean_ptr + row_idx * Mean_row_stride + RSTD_ptr_offset = RSTD_ptr + row_idx * RSTD_row_stride + + tl.store(Mean_ptr_offset, mean_rows, mask=row_mask) + tl.store(RSTD_ptr_offset, rstd_rows, mask=row_mask) + + Y_f32 = X_centered * rstd_rows[:, None] * W_row[None, :] + B_row[None, :] + + # Store output with coalesced memory access + Y_block_ptr = Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :] + tl.store(Y_block_ptr, Y_f32, mask=block_mask) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _layer_norm_forward_kernel_npu( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, + B_ptr, + Mean_ptr, + Mean_row_stride, + RSTD_ptr, + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps, + BLOCK_SIZE: tl.constexpr, +): + """NPU-optimized layer_norm forward kernel with column blocking.""" + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + + offsets = tl.arange(0, BLOCK_SIZE) + n_cols_inv = 1.0 / n_cols + + for row_idx in range(pid, n_rows, num_progs): + Y_row_ptr = Y_ptr + row_idx * Y_row_stride + X_row_ptr = X_ptr + row_idx * X_row_stride + Mean_row_ptr = Mean_ptr + row_idx * Mean_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + row_sum = 0.0 + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) + + row_sum += tl.sum(X_block) + + mean = row_sum * n_cols_inv + + var_sum = 0.0 + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + 
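+            # Second pass: re-load each column tile and accumulate the centered
+            # squares; tail positions beyond n_cols are masked out of the sum.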
mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32) + + X_centered = X_block - mean + var_sum += tl.sum(tl.where(mask, X_centered * X_centered, 0.0)) + + var = var_sum * n_cols_inv + rstd = rsqrt(var + eps) + + tl.store(Mean_row_ptr, mean) + tl.store(RSTD_row_ptr, rstd) + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".ca").to(tl.float32) + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + B_block = tl.load(B_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + + X_centered = X_block - mean + Y_f32 = X_centered * rstd * W_block + B_block + + tl.store(Y_row_ptr + col_offsets, Y_f32.to(X_block.dtype), mask=mask) + + +# ----------------------------------------------------------------------------- +# Optimized Backward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _layer_norm_backward_kernel_no_tiling( + X_ptr, + X_row_stride, + W_ptr, + Mean_ptr, + Mean_row_stride, + RSTD_ptr, + RSTD_row_stride, + DX_ptr, + DX_row_stride, + DW_scratch_ptr, + DW_scratch_stride, + DB_scratch_ptr, + DB_scratch_stride, + DY_ptr, + DY_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + n_cols_inv: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + OPTIMIZED NPU layer_norm backward kernel for small n_cols (<= 2048). + + Key optimizations: + 1. Pre-compute n_cols_inv to avoid repeated division + 2. Minimize scalar operations in the hot path + 3. Reduce redundant mask computations + 4. Optimize memory access patterns with better cache hints + 5. Keep intermediate results in float32 to reduce conversions + 6. 
Use vectorized operations throughout + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0).to(tl.float32) + + # Per-program accumulators for dW/dB + dW_acc = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32) + dB_acc = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32) + + base_row_idx = pid * BLOCK_SIZE_M + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + base_row_idx + row_offsets + row_mask = row_idx < n_rows + + # Pre-compute block mask once + block_mask = row_mask[:, None] & col_mask[None, :] + + X_block_ptr = X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :] + DY_block_ptr = DY_ptr + row_idx[:, None] * DY_row_stride + col_offsets[None, :] + Mean_row_ptr = Mean_ptr + row_idx * Mean_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Load all required data with appropriate cache hints + # .cg = cache global (read once, don't pollute cache) + X_rows = tl.load(X_block_ptr, mask=block_mask, other=0.0, cache_modifier=".cg").to(tl.float32) + DY_rows = tl.load(DY_block_ptr, mask=block_mask, other=0.0, cache_modifier=".cg").to(tl.float32) + mean_rows = tl.load(Mean_row_ptr, mask=row_mask, other=0.0).to(tl.float32) + rstd_rows = tl.load(RSTD_row_ptr, mask=row_mask, other=0.0).to(tl.float32) + + x_hat = (X_rows - mean_rows[:, None]) * rstd_rows[:, None] + wdy = W_row[None, :] * DY_rows + + x_hat_wdy_masked = tl.where(block_mask, x_hat * wdy, 0.0) + wdy_masked = tl.where(block_mask, wdy, 0.0) + + c1 = tl.sum(x_hat_wdy_masked, axis=1) * n_cols_inv + c2 = tl.sum(wdy_masked, axis=1) * n_cols_inv + + DX_f32 = (wdy - (x_hat * c1[:, None] + c2[:, None])) * rstd_rows[:, None] + + # Store dX with coalesced memory access + DX_block_ptr = DX_ptr + row_idx[:, None] * DX_row_stride + col_offsets[None, :] + tl.store(DX_block_ptr, DX_f32.to(X_ptr.dtype.element_ty), mask=block_mask) + + dW_acc += tl.sum(tl.where(block_mask, DY_rows * x_hat, 0.0), axis=0) + dB_acc += tl.sum(tl.where(block_mask, DY_rows, 0.0), axis=0) + + # Write accumulated gradients to scratch buffers + DW_scratch_offset = DW_scratch_ptr + pid * DW_scratch_stride + col_offsets + DB_scratch_offset = DB_scratch_ptr + pid * DB_scratch_stride + col_offsets + + tl.store(DW_scratch_offset, dW_acc, mask=col_mask) + tl.store(DB_scratch_offset, dB_acc, mask=col_mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _layer_norm_backward_kernel_npu( + X_ptr, + X_row_stride, + W_ptr, + Mean_ptr, + Mean_row_stride, + RSTD_ptr, + RSTD_row_stride, + DX_ptr, + DX_row_stride, + DW_ptr, + DB_ptr, + DY_ptr, + DY_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """NPU-optimized layer_norm backward kernel with column blocking.""" + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + + offsets = tl.arange(0, BLOCK_SIZE) + n_cols_inv = 1.0 / n_cols + + for row_idx in range(pid, n_rows, num_progs): + X_row_ptr = X_ptr + row_idx * X_row_stride + DY_row_ptr = DY_ptr + row_idx * DY_row_stride + DX_row_ptr = DX_ptr + row_idx * 
DX_row_stride + Mean_row_ptr = Mean_ptr + row_idx * Mean_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + mean = tl.load(Mean_row_ptr).to(tl.float32) + rstd = tl.load(RSTD_row_ptr).to(tl.float32) + + sum_x_hat_wdy = 0.0 + sum_wdy = 0.0 + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + DY_block = tl.load(DY_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + + x_hat = (X_block - mean) * rstd + wdy = W_block * DY_block + + sum_x_hat_wdy += tl.sum(tl.where(mask, x_hat * wdy, 0.0)) + sum_wdy += tl.sum(tl.where(mask, wdy, 0.0)) + + c1 = sum_x_hat_wdy * n_cols_inv + c2 = sum_wdy * n_cols_inv + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + DY_block = tl.load(DY_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32) + + x_hat = (X_block - mean) * rstd + wdy = W_block * DY_block + + DX_block = (wdy - (x_hat * c1 + c2)) * rstd + tl.store(DX_row_ptr + col_offsets, DX_block.to(X_ptr.dtype.element_ty), mask=mask) + + dW_block = DY_block * x_hat + dB_block = DY_block + + tl.atomic_add(DW_ptr + col_offsets, dW_block, mask=mask) + tl.atomic_add(DB_ptr + col_offsets, dB_block, mask=mask) + + +# ----------------------------------------------------------------------------- +# Helper Functions +# ----------------------------------------------------------------------------- + + +def get_optimal_block_size(n_cols, is_forward: bool): + """ + Calculate optimal block size using compute_default_tiling_strategy. + + Memory analysis for forward pass (per row): + - Load: X_block, W_block, B_block (3 blocks) + - Store: Y_block, Mean, RSTD (3 blocks) + - Compute: X_centered, Y intermediate (2 blocks) + - Total: conservative estimate 10 blocks of memory + + Memory analysis for backward pass (per row): + - Load: X_block, DY_block, W_block, Mean, RSTD, existing_DW, existing_DB (7 blocks) + - Store: DX_block, new_DW, new_DB (3 blocks) + - Compute: x_hat, wdy, DX intermediate, dW_block, dB_block (5 blocks) + - Total: conservative estimate 15 blocks of memory + + Args: + n_cols: Number of columns in the tensor + is_forward: Whether this is for forward pass (True) or backward pass (False) + + Returns: + Optimal block size + """ + if n_cols <= 2048: + return triton.next_power_of_2(n_cols) + + memory_multiplier = 10.0 if is_forward else 15.0 + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=memory_multiplier, + shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(2048, block_size) + else: + return 2048 + + +def _compute_grid_size(n_rows: int, block_size_m: int, num_cores: int) -> int: + """ + Compute the effective grid size for no-tiling kernels. 
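+
+    Worked example (illustrative numbers only): with n_rows=4096, block_size_m=2
+    and num_cores=20 there are cdiv(4096, 2) = 2048 row blocks, so the grid is
+    capped at min(20 * 2, 2048) = 40 programs, each of which grid-strides over
+    roughly 2048 / 40 ≈ 51 row blocks.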
+ + OPTIMIZATION: Balances parallelism with overhead + - Ensures enough work per program to amortize launch costs + - Avoids launching idle programs + - Caps at 2x core count for hardware concurrency + """ + num_row_blocks = triton.cdiv(n_rows, block_size_m) + + return min(num_cores * 2, num_row_blocks) + + +# ----------------------------------------------------------------------------- +# Forward and Backward Functions +# ----------------------------------------------------------------------------- + + +def layer_norm_forward(X, W, B, eps): + """ + NPU-optimized forward pass for LayerNorm. + + Args: + X: Input tensor of shape (..., hidden_size) + W: Weight tensor of shape (hidden_size,) + B: Bias tensor of shape (hidden_size,) + eps: Small constant for numerical stability + + Returns: + Tuple of (output, input, mean, rstd) + """ + shape = X.shape + dim = shape[-1] + X = X.view(-1, dim) + n_rows, n_cols = X.shape + + if X.shape[1] != W.shape[0]: + raise ValueError( + f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) " + f"must match weight size (W.shape[0]={W.shape[0]})" + ) + + # Get optimal block sizes + BLOCK_SIZE = get_optimal_block_size(n_cols, True) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + # Allocate output tensors + Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device) + RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device) + + num_cores = get_npu_core_count() + + # Choose kernel + if n_cols <= 2048: + grid_size = _compute_grid_size(n_rows, BLOCK_SIZE_M, num_cores) + n_cols_inv = 1.0 / float(n_cols) + + _layer_norm_forward_kernel_no_tiling[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + B, + Mean, + Mean.stride(0), + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + n_cols_inv, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + else: + grid_size = min(num_cores, n_rows) + _layer_norm_forward_kernel_npu[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + B, + Mean, + Mean.stride(0), + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return Y.view(*shape), X, Mean, RSTD + + +def layer_norm_backward(dY, X, W, B, Mean, RSTD): + """ + NPU-optimized backward pass for LayerNorm. 
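+
+    Both kernels implement the standard LayerNorm gradient. With
+    x_hat = (x - mean) * rstd and wdy = w * dy, each row uses
+
+        c1 = mean(x_hat * wdy)
+        c2 = mean(wdy)
+        dx = (wdy - (x_hat * c1 + c2)) * rstd
+
+    while dW = sum(dy * x_hat) and dB = sum(dy) are reduced over rows.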
+ + Args: + dY: Gradient of output + X: Input tensor + W: Weight tensor + B: Bias tensor + Mean: Pre-computed mean + RSTD: Pre-computed reciprocal standard deviation + + Returns: + Tuple of (input_grad, weight_grad, bias_grad) + """ + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + n_rows, n_cols = dY.shape + + # Get optimal block sizes + BLOCK_SIZE = get_optimal_block_size(n_cols, False) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + num_cores = get_npu_core_count() + + # Allocate gradient tensors + DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + + # Choose kernel + if n_cols <= 2048: + grid_size = _compute_grid_size(n_rows, BLOCK_SIZE_M, num_cores) + DW_scratch = torch.empty((grid_size, n_cols), dtype=torch.float32, device=W.device) + DB_scratch = torch.empty((grid_size, n_cols), dtype=torch.float32, device=W.device) + + n_cols_inv = 1.0 / float(n_cols) + + _layer_norm_backward_kernel_no_tiling[(grid_size,)]( + X, + X.stride(0), + W, + Mean, + Mean.stride(0), + RSTD, + RSTD.stride(0), + DX, + DX.stride(0), + DW_scratch, + DW_scratch.stride(0), + DB_scratch, + DB_scratch.stride(0), + dY, + dY.stride(0), + n_rows, + n_cols, + n_cols_inv, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + + DW = DW_scratch.sum(dim=0) + DB = DB_scratch.sum(dim=0) + else: + grid_size = min(num_cores, n_rows) + + DW = torch.zeros(n_cols, dtype=torch.float32, device=W.device) + DB = torch.zeros(n_cols, dtype=torch.float32, device=W.device) + + _layer_norm_backward_kernel_npu[(grid_size,)]( + X, + X.stride(0), + W, + Mean, + Mean.stride(0), + RSTD, + RSTD.stride(0), + DX, + DX.stride(0), + DW, + DB, + dY, + dY.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return DX.view(*shape), DW.to(W.dtype), DB.to(B.dtype) + + +# ----------------------------------------------------------------------------- +# Autograd Function +# ----------------------------------------------------------------------------- + + +class LigerLayerNormFunction(torch.autograd.Function): + """ + OPTIMIZED NPU LayerNorm operation. + + Key optimizations for no-tiling kernels: + 1. Pre-compute 1/n_cols to avoid scalar division (40.6% → <30% target) + 2. Minimize per-iteration scalar operations in grid-stride loops + 3. Hoist constant computations outside loops + 4. Use vectorized operations throughout + 5. Optimize memory access patterns with better cache hints + 6. Reduce type conversions by keeping intermediates in float32 + 7. 
Improve grid sizing for better work distribution + """ + + @staticmethod + @ensure_contiguous + def forward(ctx, X, W, B, eps): + Y, X, Mean, RSTD = layer_norm_forward(X, W, B, eps) + ctx.save_for_backward(X, W, B, Mean, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, dY): + X, W, B, Mean, RSTD = ctx.saved_tensors + DX, DW, DB = layer_norm_backward(dY, X, W, B, Mean, RSTD) + return DX, DW, DB, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/llama4_rope.py b/src/liger_kernel/ops/backends/_ascend/ops/llama4_rope.py new file mode 100755 index 0000000000000000000000000000000000000000..437f81a6e2a4c3198902cd616abd1f16739cecc2 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/llama4_rope.py @@ -0,0 +1,306 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy + + +def _cast_and_contiguous(q, k, freqs_complex): + # Align dtype: fp32 only when q is fp32; otherwise keep q dtype for perf + compute_dtype = torch.float32 if q.dtype == torch.float32 else q.dtype + + if k.dtype != q.dtype: + k = k.to(q.dtype) + + q = q.to(compute_dtype).contiguous() + k = k.to(compute_dtype).contiguous() + freqs_complex = freqs_complex.contiguous() + return q, k, freqs_complex, compute_dtype + + +@triton.jit +def _triton_llama4_rope_npu( + q_ptr, + k_ptr, + freqs_complex_ptr, + q_row_stride, + k_row_stride, + q_head_stride, + k_head_stride, + freqs_row_stride, + sl, + bs: tl.constexpr, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + BLOCK_Q: tl.constexpr, + BLOCK_K: tl.constexpr, + imag_sign: tl.constexpr, +): + """ + Llama4 RoPE on Ascend NPU for interleaved complex layout: + - q/k shape: (bs, sl, n_heads, hd) + - freqs_complex_ptr: (sl, hd//2, 2) + """ + pid = tl.program_id(0).to(tl.int64) + batch_idx = pid // sl + seq_idx = pid % sl + + if batch_idx >= bs: + return + + q_base = q_ptr + pid * q_row_stride + k_base = k_ptr + pid * k_row_stride + + freq_base = seq_idx * freqs_row_stride + hd_idx = tl.arange(0, hd) + hd_mask = hd_idx < (hd) + + freq_idx = tl.arange(0, hd) + freq_mask = freq_idx < (hd) + + freqs_complex = tl.load(freqs_complex_ptr + freq_base + freq_idx, mask=freq_mask, other=0.0) + + freqs_complex = freqs_complex.reshape(hd // 2, 2, can_reorder=True) + freqs_real, freqs_imag = tl.split(freqs_complex) + freqs_imag = freqs_imag * imag_sign + + # Q heads (chunked for UB) + for qh_block in range(0, n_qh, BLOCK_Q): + qh_idx = tl.arange(0, BLOCK_Q) + qh_block + qh_mask = qh_idx < n_qh + block_mask = qh_mask[:, None] & hd_mask[None, :] + + head_ptr = q_base + qh_idx[:, None] * q_head_stride + + q_pair = tl.load( + head_ptr + hd_idx[None, :], + mask=block_mask, + other=0.0, + ) + q_pair = q_pair.reshape(BLOCK_Q, hd // 2, 2, can_reorder=True) + q_real, q_imag = tl.split(q_pair) + + new_real = tl.math.fma(q_real, freqs_real, -(q_imag * freqs_imag)) + new_imag = tl.math.fma(q_real, freqs_imag, q_imag * freqs_real) + + pair_idx = tl.arange(0, hd // 2) + real_idx = pair_idx * 2 + imag_idx = pair_idx * 2 + 1 + + pair_mask = pair_idx < (hd // 2) + + real_mask = qh_mask[:, None] & pair_mask[None, :] + imag_mask = qh_mask[:, None] & pair_mask[None, :] + + # store real + tl.store( + head_ptr + real_idx[None, :], + new_real, + mask=real_mask, + ) + + # store imag + tl.store( + head_ptr + imag_idx[None, :], + new_imag, + mask=imag_mask, + ) + + # K heads (chunked for UB) + for kh_block in range(0, n_kh, BLOCK_K): + kh_idx = tl.arange(0, BLOCK_K) + kh_block + 
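# Same scheme as the Q loop above: load the interleaved (real, imag) pairs,
+        # rotate them as complex products (k_r + i*k_i) * (f_r + i*f_i), with the
+        # imaginary part of the frequencies pre-scaled by imag_sign (+1.0 for the
+        # forward rotation, -1.0 for the backward/conjugate rotation), then store
+        # the two halves back in place at even/odd offsets. +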
kh_mask = kh_idx < n_kh + block_mask = kh_mask[:, None] & hd_mask[None, :] + + head_ptr = k_base + kh_idx[:, None] * k_head_stride + + k_pair = tl.load( + head_ptr + hd_idx[None, :], + mask=block_mask, + other=0.0, + ) + + k_pair = k_pair.reshape(BLOCK_K, hd // 2, 2, can_reorder=True) + k_real, k_imag = tl.split(k_pair) + + new_real = tl.math.fma(k_real, freqs_real, -(k_imag * freqs_imag)) + new_imag = tl.math.fma(k_real, freqs_imag, k_imag * freqs_real) + + pair_idx = tl.arange(0, hd // 2) + real_idx = pair_idx * 2 + imag_idx = pair_idx * 2 + 1 + + pair_mask = pair_idx < (hd // 2) + + real_mask = kh_mask[:, None] & pair_mask[None, :] + imag_mask = kh_mask[:, None] & pair_mask[None, :] + + # store real + tl.store( + head_ptr + real_idx[None, :], + new_real, + mask=real_mask, + ) + + # store imag + tl.store( + head_ptr + imag_idx[None, :], + new_imag, + mask=imag_mask, + ) + + +def llama4_rope_forward(q, k, freqs_cis): + """ + Ascend NPU implementation of Llama4 RoPE. + + q/k: (bs, sl, n_heads, hd) with interleaved complex last-dim layout. + freqs_cis: complex (..., hd//2) OR packed (..., 2*(hd//2)). + """ + original_dtype = q.dtype + + bs, sl, n_qh, hd = q.shape + _, _, n_kh, _ = k.shape + if hd % 2 != 0: + raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}") + + if freqs_cis.is_complex(): + freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1]) + if freqs_cis.shape[0] > sl: + freqs_cis = freqs_cis[:sl] + freqs_cis = torch.view_as_real(freqs_cis) + + q, k, freqs_cis, compute_dtype = _cast_and_contiguous(q, k, freqs_cis) + + # UB tiling strategy: tile heads dimension only + dtype_size = q.element_size() + shapes = ((n_qh, hd), (n_kh, hd)) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.90, + dtype_size=dtype_size, + memory_multiplier=20.0, + shapes=shapes, + tiling_dims=(0, 0), + ) + + if tile_shapes is not None and len(tile_shapes) == len(shapes): + q_tile_shape, k_tile_shape = tile_shapes + BLOCK_Q, _ = q_tile_shape + BLOCK_K, _ = k_tile_shape + BLOCK_Q = max(BLOCK_Q, 2) + BLOCK_K = max(BLOCK_K, 2) + else: + BLOCK_Q = triton.next_power_of_2(n_qh) + BLOCK_K = triton.next_power_of_2(n_kh) + + n_row = bs * sl + + _triton_llama4_rope_npu[(n_row,)]( + q, + k, + freqs_cis, + q.stride(1), + k.stride(1), + q.stride(2), + k.stride(2), + freqs_cis.stride(0), + sl, + bs, + n_qh, + n_kh, + hd, + BLOCK_Q, + BLOCK_K, + imag_sign=1.0, + ) + + if compute_dtype != original_dtype: + q = q.to(original_dtype) + k = k.to(original_dtype) + return q, k + + +def llama4_rope_backward(dq, dk, freqs_cis): + """ + Ascend NPU implementation of Llama4 RoPE. + + q/k: (bs, sl, n_heads, hd) with interleaved complex last-dim layout. + freqs_cis: complex (..., hd//2) OR packed (..., 2*(hd//2)). 
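+
+    The backward pass reuses the forward kernel with imag_sign=-1.0: RoPE is a
+    pure rotation, so the gradient is the incoming dq/dk rotated through the
+    conjugate (inverse) of freqs_cis.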
+ """ + original_dtype = dq.dtype + + bs, sl, n_qh, hd = dq.shape + _, _, n_kh, _ = dk.shape + if hd % 2 != 0: + raise ValueError(f"head_dim must be even for interleaved complex layout, got {hd}") + + if freqs_cis.is_complex(): + freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1]) + if freqs_cis.shape[0] > sl: + freqs_cis = freqs_cis[:sl] + freqs_cis = torch.view_as_real(freqs_cis) + + dq, dk, freqs_cis, compute_dtype = _cast_and_contiguous(dq, dk, freqs_cis) + + # UB tiling strategy: tile heads dimension only + dtype_size = dq.element_size() + shapes = ((n_qh, hd), (n_kh, hd)) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.90, + dtype_size=dtype_size, + memory_multiplier=20.0, + shapes=shapes, + tiling_dims=(0, 0), + ) + + if tile_shapes is not None and len(tile_shapes) == len(shapes): + q_tile_shape, k_tile_shape = tile_shapes + BLOCK_Q, _ = q_tile_shape + BLOCK_K, _ = k_tile_shape + BLOCK_Q = max(BLOCK_Q, 2) + BLOCK_K = max(BLOCK_K, 2) + else: + BLOCK_Q = triton.next_power_of_2(n_qh) + BLOCK_K = triton.next_power_of_2(n_kh) + + n_row = bs * sl + + _triton_llama4_rope_npu[(n_row,)]( + dq, + dk, + freqs_cis, + dq.stride(1), + dk.stride(1), + dq.stride(2), + dk.stride(2), + freqs_cis.stride(0), + sl, + bs, + n_qh, + n_kh, + hd, + BLOCK_Q, + BLOCK_K, + imag_sign=-1.0, + ) + + if compute_dtype != original_dtype: + dq = dq.to(original_dtype) + dk = dk.to(original_dtype) + return dq, dk + + +class LigerLlama4RopeFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, freqs_cis, BLOCK_SIZE: int = None): + # BLOCK_SIZE is ignored for Ascend (we auto-tile heads by UB), kept for API compatibility + q_out, k_out = llama4_rope_forward(q, k, freqs_cis) + ctx.save_for_backward(freqs_cis.detach() if isinstance(freqs_cis, torch.Tensor) else freqs_cis) + return q_out, k_out + + @staticmethod + def backward(ctx, dq, dk): + (freqs_cis,) = ctx.saved_tensors + dq_out, dk_out = llama4_rope_backward(dq, dk, freqs_cis) + return dq_out, dk_out, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/poly_norm.py b/src/liger_kernel/ops/backends/_ascend/ops/poly_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..d4deb2329a177da52deb2214487242774a71feee --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/poly_norm.py @@ -0,0 +1,786 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import rsqrt + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +# ----------------------------------------------------------------------------- +# Forward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _poly_norm_forward_kernel_no_tiling( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, # weight: [3] for [w0, w1, w2] + B_ptr, # bias: scalar + RSTD_ptr, # cache rstd for backward: shape (n_rows, 3) + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + NPU-optimized PolyNorm forward kernel for small n_cols (<= 2048). 
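+
+    Reference semantics in plain PyTorch (a minimal illustrative sketch;
+    poly_norm_ref is not part of this module):
+
+        def poly_norm_ref(x, w, b, eps=1e-6):
+            def norm(u):
+                return u * torch.rsqrt(u.pow(2).mean(-1, keepdim=True) + eps)
+            return w[0] * norm(x ** 3) + w[1] * norm(x ** 2) + w[2] * norm(x) + b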
+ + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-stride loop setup + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + # Load weights and bias + w0 = tl.load(W_ptr + 0) + w1 = tl.load(W_ptr + 1) + w2 = tl.load(W_ptr + 2) + b = tl.load(B_ptr) + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + # Load input rows + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + cache_modifier=".cg", + ) + + X_f32 = X_rows.to(tl.float32) + + # Compute x³, x², x + X_pow3 = X_f32 * X_f32 * X_f32 + X_pow2 = X_f32 * X_f32 + X_pow1 = X_f32 + + # Compute norm(x³): norm(u) = u * rsqrt(mean(u²) + eps) + # Mask out out-of-bounds positions to prevent contaminating the sum + mean_square_3 = tl.sum(X_pow3 * X_pow3, axis=1) / n_cols + rstd_3 = rsqrt(mean_square_3 + eps) + norm_x3 = X_pow3 * rstd_3[:, None] + + # Compute norm(x²) + mean_square_2 = tl.sum(X_pow2 * X_pow2, axis=1) / n_cols + rstd_2 = rsqrt(mean_square_2 + eps) + norm_x2 = X_pow2 * rstd_2[:, None] + + # Compute norm(x) + mean_square_1 = tl.sum(X_pow1 * X_pow1, axis=1) / n_cols + rstd_1 = rsqrt(mean_square_1 + eps) + norm_x1 = X_pow1 * rstd_1[:, None] + + # Cache rstd values for backward (store 3 values per row) + tl.store(RSTD_ptr + row_idx * RSTD_row_stride + 0, rstd_3.to(X_rows.dtype), mask=row_mask) + tl.store(RSTD_ptr + row_idx * RSTD_row_stride + 1, rstd_2.to(X_rows.dtype), mask=row_mask) + tl.store(RSTD_ptr + row_idx * RSTD_row_stride + 2, rstd_1.to(X_rows.dtype), mask=row_mask) + + # Compute output: y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + Y_f32 = w0 * norm_x3 + w1 * norm_x2 + w2 * norm_x1 + b + + # Store output + tl.store( + Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :], + Y_f32.to(X_rows.dtype), + mask=block_mask, + ) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _poly_norm_forward_kernel_npu( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, # weight: [3] for [w0, w1, w2] + B_ptr, # bias: scalar + RSTD_ptr, # cache rstd for backward: shape (n_rows, 3) + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps, + BLOCK_SIZE: tl.constexpr, +): + """ + NPU-optimized PolyNorm forward kernel with column blocking. + + This kernel processes rows using a grid-stride loop pattern: + 1. Each program handles multiple rows + 2. For each row, we process it in column chunks of BLOCK_SIZE + 3. 
Grid size is limited to NPU core count to avoid resource overflow
+
+    Two-pass algorithm per row:
+    - First pass: compute mean_square and rstd for x³, x², x across all column blocks
+    - Second pass: apply normalization and affine transformation
+
+    PolyNorm formula:
+        y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b
+    where norm(u) = u / sqrt(mean(u²) + ε)
+    """
+    pid = tl.program_id(0)
+    num_progs = tl.num_programs(0)
+    num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE)
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+
+    # Load weights and bias
+    w0 = tl.load(W_ptr + 0)
+    w1 = tl.load(W_ptr + 1)
+    w2 = tl.load(W_ptr + 2)
+    b = tl.load(B_ptr)
+
+    # Grid-stride loop over rows
+    for row_idx in range(pid, n_rows, num_progs):
+        Y_row_ptr = Y_ptr + row_idx * Y_row_stride
+        X_row_ptr = X_ptr + row_idx * X_row_stride
+        RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride
+
+        # First pass: compute mean_square for x³, x², x
+        sum_square_3 = 0.0
+        sum_square_2 = 0.0
+        sum_square_1 = 0.0
+
+        for col_block_idx in range(num_col_blocks):
+            col_start = col_block_idx * BLOCK_SIZE
+            col_offsets = col_start + offsets
+            mask = col_offsets < n_cols
+
+            X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".cg").to(tl.float32)
+
+            # Compute powers
+            X_pow3 = X_block * X_block * X_block
+            X_pow2 = X_block * X_block
+            X_pow1 = X_block
+
+            sum_square_3 += tl.sum(X_pow3 * X_pow3)
+            sum_square_2 += tl.sum(X_pow2 * X_pow2)
+            sum_square_1 += tl.sum(X_pow1 * X_pow1)
+
+        # Compute rstd values
+        mean_square_3 = sum_square_3 / n_cols
+        mean_square_2 = sum_square_2 / n_cols
+        mean_square_1 = sum_square_1 / n_cols
+
+        rstd_3 = rsqrt(mean_square_3 + eps)
+        rstd_2 = rsqrt(mean_square_2 + eps)
+        rstd_1 = rsqrt(mean_square_1 + eps)
+
+        # Store rstd values
+        tl.store(RSTD_row_ptr + 0, rstd_3)
+        tl.store(RSTD_row_ptr + 1, rstd_2)
+        tl.store(RSTD_row_ptr + 2, rstd_1)
+
+        # Second pass: normalize and apply affine transformation
+        for col_block_idx in range(num_col_blocks):
+            col_start = col_block_idx * BLOCK_SIZE
+            col_offsets = col_start + offsets
+            mask = col_offsets < n_cols
+
+            # Load input
+            X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".ca").to(tl.float32)
+
+            # Compute powers
+            X_pow3 = X_block * X_block * X_block
+            X_pow2 = X_block * X_block
+            X_pow1 = X_block
+
+            # Apply normalization
+            norm_x3 = X_pow3 * rstd_3
+            norm_x2 = X_pow2 * rstd_2
+            norm_x1 = X_pow1 * rstd_1
+
+            # Compute output: y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b
+            Y_f32 = w0 * norm_x3 + w1 * norm_x2 + w2 * norm_x1 + b
+
+            # Store result
+            tl.store(Y_row_ptr + col_offsets, Y_f32.to(X_block.dtype), mask=mask)
+
+
+# -----------------------------------------------------------------------------
+# Backward Kernel - No Tiling (for n_cols <= 2048)
+# -----------------------------------------------------------------------------
+
+
+@triton.jit
+def _poly_norm_backward_kernel_no_tiling(
+    dY_ptr,
+    dY_row_stride,
+    dX_ptr,
+    dX_row_stride,
+    X_ptr,
+    X_row_stride,
+    W_ptr,
+    RSTD_ptr,
+    RSTD_row_stride,
+    dW_scratch_ptr,  # shape: (n_programs, 3)
+    dW_scratch_stride,
+    dB_scratch_ptr,  # shape: (n_programs,)
+    n_rows: tl.constexpr,
+    n_cols: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """
+    NPU-optimized PolyNorm backward kernel for small n_cols (<= 2048).
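+
+    Instead of atomics, each program accumulates its dW/dB contribution locally
+    and writes it to a private row of the (n_programs, 3) / (n_programs,)
+    scratch buffers; the host finishes the reduction with dW_scratch.sum(dim=0)
+    and dB_scratch.sum().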
+ + Backward pass equations: + ∂L/∂x_i = Σ_p w_p * [p*x_i^(p-1) * grad_i/D_p - (p/d)*x_i^(2p-1) * S_p/(D_p³)] + + where: + - D_p = RMS(x^p) = 1/rstd_p + - S_p = sum(grad * x^p) over the row + - d = n_cols + - p ∈ {3, 2, 1} + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-stride loop setup + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + # Load weights + w0 = tl.load(W_ptr + 0).to(tl.float32) + w1 = tl.load(W_ptr + 1).to(tl.float32) + w2 = tl.load(W_ptr + 2).to(tl.float32) + + # Each program accumulates its own dW/dB contribution to avoid atomic contention + dW0_acc = 0.0 + dW1_acc = 0.0 + dW2_acc = 0.0 + dB_acc = 0.0 + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + # Load input and gradient data + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + cache_modifier=".cg", + ) + dY_rows = tl.load( + dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + cache_modifier=".cg", + ) + + # Load cached rstd values (3 values per row) + rstd_3 = tl.load(RSTD_ptr + row_idx * RSTD_row_stride + 0, mask=row_mask, other=0.0).to(tl.float32) + rstd_2 = tl.load(RSTD_ptr + row_idx * RSTD_row_stride + 1, mask=row_mask, other=0.0).to(tl.float32) + rstd_1 = tl.load(RSTD_ptr + row_idx * RSTD_row_stride + 2, mask=row_mask, other=0.0).to(tl.float32) + + X_f32 = X_rows.to(tl.float32) + dY_f32 = dY_rows.to(tl.float32) + + # Compute powers + X_pow3 = X_f32 * X_f32 * X_f32 + X_pow2 = X_f32 * X_f32 + X_pow1 = X_f32 + + # Accumulate bias gradient: dB = sum(dY) + dB_acc += tl.sum(dY_f32) + + # Compute gradient w.r.t. 
input using closed-form formula + # For p=3: ∂L/∂x from w0 * norm(x³) + S_3 = tl.sum(dY_f32 * X_pow3, axis=1) # sum over columns for each row + grad_x_3 = w0 * ( + 3.0 * X_pow2 * rstd_3[:, None] * dY_f32 + - (3.0 / n_cols) * X_pow2 * X_pow3 * (rstd_3[:, None] * rstd_3[:, None] * rstd_3[:, None]) * S_3[:, None] + ) + + # For p=2: ∂L/∂x from w1 * norm(x²) + S_2 = tl.sum(dY_f32 * X_pow2, axis=1) + grad_x_2 = w1 * ( + 2.0 * X_pow1 * rstd_2[:, None] * dY_f32 + - (2.0 / n_cols) * X_pow1 * X_pow2 * (rstd_2[:, None] * rstd_2[:, None] * rstd_2[:, None]) * S_2[:, None] + ) + + # For p=1: ∂L/∂x from w2 * norm(x) + S_1 = tl.sum(dY_f32 * X_pow1, axis=1) + grad_x_1 = w2 * ( + 1.0 * rstd_1[:, None] * dY_f32 + - (1.0 / n_cols) * X_pow1 * (rstd_1[:, None] * rstd_1[:, None] * rstd_1[:, None]) * S_1[:, None] + ) + + # Total gradient + dX_f32 = grad_x_3 + grad_x_2 + grad_x_1 + + # Store dX + tl.store( + dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :], + dX_f32.to(X_ptr.dtype.element_ty), + mask=block_mask, + ) + + # Accumulate weight gradients using closed-form: dW_p = rstd_p * S_p + dW0_acc += tl.sum(rstd_3 * S_3) + dW1_acc += tl.sum(rstd_2 * S_2) + dW2_acc += tl.sum(rstd_1 * S_1) + + # Write this program's accumulated dW/dB to its dedicated scratch row + tl.store(dW_scratch_ptr + pid * dW_scratch_stride + 0, dW0_acc) + tl.store(dW_scratch_ptr + pid * dW_scratch_stride + 1, dW1_acc) + tl.store(dW_scratch_ptr + pid * dW_scratch_stride + 2, dW2_acc) + tl.store(dB_scratch_ptr + pid, dB_acc) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _poly_norm_backward_kernel_npu( + dY_ptr, + dY_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, + dB_ptr, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + NPU-optimized PolyNorm backward kernel with column blocking. + + Each program processes multiple rows using grid-stride loop. + For each row, we process columns in blocks to avoid UB overflow. 
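+
+    Two passes are required because the row-level sums S_p appear in the
+    gradient of every element of the row, so they must be fully accumulated
+    before any element of dX can be computed.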
+
+    Two-pass algorithm:
+    - First pass: compute S_p = sum(grad * x^p) for p ∈ {3, 2, 1}
+    - Second pass: compute gradients dX, dW, dB
+    """
+    pid = tl.program_id(0)
+    num_progs = tl.num_programs(0)
+    num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE)
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+
+    # Load weights
+    w0 = tl.load(W_ptr + 0).to(tl.float32)
+    w1 = tl.load(W_ptr + 1).to(tl.float32)
+    w2 = tl.load(W_ptr + 2).to(tl.float32)
+
+    dw0_acc = 0.0
+    dw1_acc = 0.0
+    dw2_acc = 0.0
+    db_acc = 0.0
+
+    # Grid-stride loop over rows
+    for row_idx in range(pid, n_rows, num_progs):
+        dY_row_ptr = dY_ptr + row_idx * dY_row_stride
+        X_row_ptr = X_ptr + row_idx * X_row_stride
+        dX_row_ptr = dX_ptr + row_idx * dX_row_stride
+        RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride
+
+        # Load cached rstd values
+        rstd_3 = tl.load(RSTD_row_ptr + 0).to(tl.float32)
+        rstd_2 = tl.load(RSTD_row_ptr + 1).to(tl.float32)
+        rstd_1 = tl.load(RSTD_row_ptr + 2).to(tl.float32)
+
+        # First pass: compute S_p = sum(grad * x^p)
+        S_3 = 0.0
+        S_2 = 0.0
+        S_1 = 0.0
+
+        for col_block_idx in range(num_col_blocks):
+            col_start = col_block_idx * BLOCK_SIZE
+            col_offsets = col_start + offsets
+            mask = col_offsets < n_cols
+
+            X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
+            dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
+
+            # Compute powers
+            X_pow3 = X_block * X_block * X_block
+            X_pow2 = X_block * X_block
+            X_pow1 = X_block
+
+            S_3 += tl.sum(dY_block * X_pow3)
+            S_2 += tl.sum(dY_block * X_pow2)
+            S_1 += tl.sum(dY_block * X_pow1)
+
+        # Second pass: compute gradients
+        for col_block_idx in range(num_col_blocks):
+            col_start = col_block_idx * BLOCK_SIZE
+            col_offsets = col_start + offsets
+            mask = col_offsets < n_cols
+
+            X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
+            dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
+
+            # Compute powers
+            X_pow3 = X_block * X_block * X_block
+            X_pow2 = X_block * X_block
+            X_pow1 = X_block
+
+            # Compute gradient w.r.t. input using closed-form formula
+            # For p=3: ∂L/∂x from w0 * norm(x³)
+            grad_x_3 = w0 * (
+                3.0 * X_pow2 * rstd_3 * dY_block - (3.0 / n_cols) * X_pow2 * X_pow3 * (rstd_3 * rstd_3 * rstd_3) * S_3
+            )
+
+            # For p=2: ∂L/∂x from w1 * norm(x²)
+            grad_x_2 = w1 * (
+                2.0 * X_pow1 * rstd_2 * dY_block - (2.0 / n_cols) * X_pow1 * X_pow2 * (rstd_2 * rstd_2 * rstd_2) * S_2
+            )
+
+            # For p=1: ∂L/∂x from w2 * norm(x)
+            grad_x_1 = w2 * (1.0 * rstd_1 * dY_block - (1.0 / n_cols) * X_pow1 * (rstd_1 * rstd_1 * rstd_1) * S_1)
+
+            # Total gradient
+            dX_block = grad_x_3 + grad_x_2 + grad_x_1
+
+            # Store dX
+            tl.store(dX_row_ptr + col_offsets, dX_block.to(X_ptr.dtype.element_ty), mask=mask)
+
+            dw0_acc += tl.sum(rstd_3 * dY_block * X_pow3)
+            dw1_acc += tl.sum(rstd_2 * dY_block * X_pow2)
+            dw2_acc += tl.sum(rstd_1 * dY_block * X_pow1)
+            db_acc += tl.sum(dY_block)
+
+    # Combine the per-program partial sums with atomic adds. Several programs
+    # run concurrently and all target the same dW/dB locations, so a plain
+    # tl.store here would let the last program overwrite the others' results;
+    # the host allocates dW/dB with torch.zeros precisely so these atomics can
+    # accumulate into them.
+    tl.atomic_add(dW_ptr + 0, dw0_acc)
+    tl.atomic_add(dW_ptr + 1, dw1_acc)
+    tl.atomic_add(dW_ptr + 2, dw2_acc)
+    tl.atomic_add(dB_ptr, db_acc)
+
+
+# -----------------------------------------------------------------------------
+# Helper Functions
+# -----------------------------------------------------------------------------
+
+
+def get_optimal_block_size(n_cols, is_forward: bool):
+    """
+    Calculate optimal block size using compute_default_tiling_strategy.
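+
+    For n_cols <= 2048 the whole row fits in UB, so the block size is simply the
+    next power of two; for larger rows compute_default_tiling_strategy picks a
+    tile that fits the memory budget below, clamped to at least 2048.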
+ + Memory analysis for forward pass (per row): + - Load: X_block (1 block) + - Compute: X_pow3, X_pow2, X_pow1, norm_x3, norm_x2, norm_x1 (6 blocks) + - Total: conservative estimate 8 blocks of memory + + Memory analysis for backward pass (per row): + - Load: X_block, dY_block, RSTD (3 blocks) + - Compute: X_pow3, X_pow2, X_pow1, grad_x_3, grad_x_2, grad_x_1 (6 blocks) + - Total: conservative estimate 10 blocks of memory + + Args: + n_cols: Number of columns in the tensor + is_forward: Whether this is for forward pass (True) or backward pass (False) + + Returns: + Optimal block size + """ + if n_cols <= 2048: + return triton.next_power_of_2(n_cols) + + memory_multiplier = 8.0 if is_forward else 10.0 + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.8, + dtype_size=4, + memory_multiplier=memory_multiplier, + shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(2048, block_size) + else: + return 2048 + + +def _compute_grid_size(n_rows: int, block_size_m: int, num_cores: int) -> int: + """ + Compute the effective grid size for no-tiling kernels. + + Limits the grid to the minimum of: + - The number of row blocks actually needed (ceil(n_rows / BLOCK_SIZE_M)), which + prevents launching idle programs that would waste core cycles + - NPU core count, which is the hardware concurrency upper bound + + Args: + n_rows: Total number of rows to process + block_size_m: Number of rows each program handles per iteration + num_cores: Number of available NPU cores + + Returns: + Effective grid size + """ + num_row_blocks = triton.cdiv(n_rows, block_size_m) + return min(num_cores, num_row_blocks) + + +# ----------------------------------------------------------------------------- +# Forward and Backward Functions +# ----------------------------------------------------------------------------- + + +def poly_norm_forward(X, W, B, eps=1e-6): + """ + PolyNorm Forward Pass + + Args: + X: input tensor of shape (*, H) where H is hidden dimension + W: weight tensor of shape (3,) for [w0, w1, w2] + B: bias scalar tensor + eps: epsilon for numerical stability + + Returns: + Y: output tensor of same shape as X + X: reshaped input (for backward) + RSTD: cached rstd values (for backward) + BLOCK_SIZE: block size used + """ + shape = X.shape + dim = shape[-1] + X = X.view(-1, dim) + n_rows, n_cols = X.shape + + # Check constraints + assert W.shape[0] == 3, "Weight tensor must have shape (3,)" + assert B.numel() == 1, "Bias must be a scalar" + + # Get optimal block sizes + BLOCK_SIZE = get_optimal_block_size(n_cols, True) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + # RSTD is to cache rstd for each row (3 values per row) + Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + RSTD = torch.empty((n_rows, 3), dtype=torch.float32, device=X.device) + + # Grid size + num_cores = get_npu_core_count() + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Small kernel: use 2D tensor loading + grid_size = _compute_grid_size(n_rows, BLOCK_SIZE_M, num_cores) + + _poly_norm_forward_kernel_no_tiling[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + B, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + else: + # Large kernel: use column blocking + grid_size = min(num_cores, n_rows) + + _poly_norm_forward_kernel_npu[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + B, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + 
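# column tile width chosen by get_optimal_block_size(n_cols, True) above +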
BLOCK_SIZE=BLOCK_SIZE, + ) + + return Y.view(*shape), X, RSTD + + +def poly_norm_backward(dY, X, W, RSTD, in_place): + """ + PolyNorm Backward Pass + + Args: + dY: gradient of output + X: input tensor (already reshaped to 2D) + W: weight tensor + RSTD: cached rstd values from forward + BLOCK_SIZE: block size from forward + in_place: whether to in-place modify dY to store dX (saves memory) + + Returns: + dX: gradient w.r.t. input + dW: gradient w.r.t. weight + dB: gradient w.r.t. bias + """ + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + n_rows, n_cols = dY.shape + + # Get optimal block sizes + BLOCK_SIZE_BACKWARD = get_optimal_block_size(n_cols, False) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE_BACKWARD + + # Grid size + num_cores = get_npu_core_count() + + # Allocate or reuse gradients + if in_place is True: + dX = dY + else: + dX = torch.zeros_like(dY) + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Small kernel: use 2D tensor loading with scratch buffers + grid_size = _compute_grid_size(n_rows, BLOCK_SIZE_M, num_cores) + + # Allocate per-program scratch buffers for dW and dB + dW_scratch = torch.empty((grid_size, 3), dtype=torch.float32, device=W.device) + dB_scratch = torch.empty((grid_size,), dtype=torch.float32, device=W.device) + + _poly_norm_backward_kernel_no_tiling[(grid_size,)]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + W, + RSTD, + RSTD.stride(0), + dW_scratch, + dW_scratch.stride(0), + dB_scratch, + n_rows, + n_cols, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_BACKWARD, + ) + + dW = dW_scratch.sum(dim=0).to(W.dtype) + dB = dB_scratch.sum().to(W.dtype) + else: + # Large kernel: use column blocking with atomic operations + grid_size = min(num_cores, n_rows) + + dW = torch.zeros(3, dtype=torch.float32, device=W.device) + dB = torch.zeros(1, dtype=torch.float32, device=W.device) + + _poly_norm_backward_kernel_npu[(grid_size,)]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + W, + RSTD, + RSTD.stride(0), + dW, + dB, + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE_BACKWARD, + ) + + dW = dW.to(W.dtype) + dB = dB.squeeze().to(W.dtype) + + # Reshape dX back to original shape + dX = dX.view(*shape) + + return dX, dW, dB + + +# ----------------------------------------------------------------------------- +# Autograd Function +# ----------------------------------------------------------------------------- + + +class LigerPolyNormFunction(torch.autograd.Function): + """ + PolyNorm Function with forward and backward pass + + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + + Backward uses closed-form gradient: + ∂L/∂x_i = Σ_p w_p * [p*x_i^(p-1) * grad_i/D_p - (p/d)*x_i^(2p-1) * S_p/(D_p³)] + """ + + @staticmethod + @ensure_contiguous + def forward(ctx, X, W, B, eps=1e-6, in_place=True): + """ + Args: + X: input tensor of shape (B, T, H) or (BxT, H) + W: weight tensor of shape (3,) for [w0, w1, w2] + B: bias scalar + eps: epsilon for numerical stability + in_place: whether to in-place modify grad_output in backward (saves memory) + + Returns: + Y: output tensor of same shape as X + """ + Y, X, RSTD = poly_norm_forward(X, W, B, eps) + ctx.in_place = in_place + ctx.save_for_backward(X, W, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output): + """ + Args: + grad_output: gradient of output + + Returns: + dX, dW, dB: gradients w.r.t. 
X, W, B + """ + X, W, RSTD = ctx.saved_tensors + dX, dW, dB = poly_norm_backward(grad_output, X, W, RSTD, ctx.in_place) + return dX, dW, dB, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py b/src/liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py new file mode 100755 index 0000000000000000000000000000000000000000..d273b7ec972e48e210819a9d9b45745524513554 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py @@ -0,0 +1,272 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _triton_qwen2vl_mrope_npu( + q_ptr, + q_row_stride, + k_ptr, + k_row_stride, + cos, + sin, + sl, + bs: tl.constexpr, + total_rows: tl.constexpr, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + mrope_section_t: tl.constexpr, + mrope_section_h: tl.constexpr, + BLOCK_Q: tl.constexpr, + BLOCK_K: tl.constexpr, + BACKWARD_PASS: tl.constexpr = False, +): + program_id = tl.program_id(0) + num_programs = tl.num_programs(0) + + rows_per_program = (total_rows + num_programs - 1) // num_programs + start_row = program_id * rows_per_program + actual_rows = tl.minimum(rows_per_program, total_rows - start_row) + + for row_offset in tl.range(0, actual_rows): + pid = start_row + row_offset + + t_end = mrope_section_t + h_end = t_end + mrope_section_h + + t_cos = cos + pid * hd + h_cos = t_cos + bs * sl * hd + w_cos = h_cos + bs * sl * hd + t_sin = sin + pid * hd + h_sin = t_sin + bs * sl * hd + w_sin = h_sin + bs * sl * hd + + q_base = q_ptr + pid * q_row_stride + k_base = k_ptr + pid * k_row_stride + + d_idx = tl.arange(0, hd // 2) + d_mask = d_idx < (hd // 2) + + pos_mask_t = d_idx < t_end + pos_mask_h = (d_idx >= t_end) & (d_idx < h_end) + + text_cos_vals = tl.load(t_cos + d_idx, mask=d_mask, other=0) + text_sin_vals = tl.load(t_sin + d_idx, mask=d_mask, other=0) + height_cos_vals = tl.load(h_cos + d_idx, mask=d_mask, other=0) + height_sin_vals = tl.load(h_sin + d_idx, mask=d_mask, other=0) + width_cos_vals = tl.load(w_cos + d_idx, mask=d_mask, other=0) + width_sin_vals = tl.load(w_sin + d_idx, mask=d_mask, other=0) + + cos_vals = tl.where(pos_mask_t, text_cos_vals, tl.where(pos_mask_h, height_cos_vals, width_cos_vals)) + sin_vals = tl.where(pos_mask_t, text_sin_vals, tl.where(pos_mask_h, height_sin_vals, width_sin_vals)) + + # Process q heads in chunks to prevent UB overflow + for qh_block in range(0, n_qh, BLOCK_Q): + qh_idx = tl.arange(0, BLOCK_Q) + qh_block + qh_mask = qh_idx < n_qh + + block_mask = qh_mask[:, None] & d_mask[None, :] + offsets = qh_idx[:, None] * hd + d_idx[None, :] + + q_left = tl.load(q_base + offsets, mask=block_mask, other=0) + q_right = tl.load(q_base + offsets + (hd // 2), mask=block_mask, other=0) + + if not BACKWARD_PASS: + new_left = q_left * cos_vals - q_right * sin_vals + new_right = q_right * cos_vals + q_left * sin_vals + else: + new_left = q_left * cos_vals + q_right * sin_vals + new_right = q_right * cos_vals - q_left * sin_vals + + tl.store(q_base + offsets, new_left, mask=block_mask) + tl.store(q_base + offsets + (hd // 2), new_right, mask=block_mask) + + # Process k heads in chunks to prevent UB overflow + for kh_block in range(0, n_kh, BLOCK_K): + kh_idx = tl.arange(0, BLOCK_K) + kh_block + kh_mask = kh_idx < n_kh + + block_mask = kh_mask[:, None] & d_mask[None, :] + offsets = kh_idx[:, None] * hd + d_idx[None, :] + + k_left = tl.load(k_base 
+ offsets, mask=block_mask, other=0) + k_right = tl.load(k_base + offsets + (hd // 2), mask=block_mask, other=0) + + if not BACKWARD_PASS: + new_left = k_left * cos_vals - k_right * sin_vals + new_right = k_right * cos_vals + k_left * sin_vals + else: + new_left = k_left * cos_vals + k_right * sin_vals + new_right = k_right * cos_vals - k_left * sin_vals + + tl.store(k_base + offsets, new_left, mask=block_mask) + tl.store(k_base + offsets + (hd // 2), new_right, mask=block_mask) + + +def get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size): + # MROPE forward tiling strategy: + # - cos_vals and sin_vals (include text, height and width) are loaded once outside loops (shared): (pad_hd // 2) * 6 = 3 * pad_hd elements each + # - In q heads loop (peak memory): + # * q_left: BLOCK_Q * (pad_hd // 2) elements + # * q_right: BLOCK_Q * (pad_hd // 2) elements + # * new_left: BLOCK_Q * (pad_hd // 2) elements (intermediate result) + # * new_right: BLOCK_Q * (pad_hd // 2) elements (intermediate result) + # * Total: 4 * BLOCK_Q * (pad_hd // 2) = 2 * BLOCK_Q * pad_hd elements + # - In k heads loop (peak memory): + # * k_left: BLOCK_K * (pad_hd // 2) elements + # * k_right: BLOCK_K * (pad_hd // 2) elements + # * new_left: BLOCK_K * (pad_hd // 2) elements (intermediate result) + # * new_right: BLOCK_K * (pad_hd // 2) elements (intermediate result) + # * Total: 4 * BLOCK_K * (pad_hd // 2) = 2 * BLOCK_K * pad_hd elements + # - Since q and k are processed separately, peak memory is max(BLOCK_Q, BLOCK_K) case + # - Plus shared cos/sin: 6 * (pad_hd // 2) = 3 * pad_hd elements + # - Conservative estimate: (2 * BLOCK_SIZE * pad_hd + 3 * pad_hd) * dtype_size * 8 bits + # - Simplified: (2 * BLOCK_SIZE + 3) * pad_hd * dtype_size * 8 bits + # - For safety, use: memory_multiplier=3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits + # - shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd)) + # - tiling_dims: (0, 0) means first dimension of each shape can be tiled + # - Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd)) + shapes = ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd)) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.90, + dtype_size=dtype_size, + memory_multiplier=3.0, + shapes=shapes, + tiling_dims=(0, 0), + ) + + if tile_shapes is not None and len(tile_shapes) == len(shapes): + # Strategy returns ((block_size_q, pad_hd), (block_size_kv, pad_hd)) + q_tile_shape, k_tile_shape = tile_shapes + BLOCK_Q, _ = q_tile_shape + BLOCK_K, _ = k_tile_shape + else: + # Fallback to conservative defaults + BLOCK_Q = 2048 + BLOCK_K = 2048 + + return BLOCK_Q, BLOCK_K + + +def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section): + # transpose it back to the physical shape because Triton looks at the physical storage + q = q.transpose(1, 2) + k = k.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = q.shape + n_kv_head = k.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + n_row = batch_size * seq_len + + # ensure tensors passed into the kernel are contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + + dtype_size = q.element_size() + BLOCK_Q, BLOCK_K = get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_row) + + _triton_qwen2vl_mrope_npu[(grid_size,)]( + q, + q.stride(1), + k, + k.stride(1), + cos, + sin, + seq_len, + 
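# same launch as the forward pass; BACKWARD_PASS=True applies the inverse rotation +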
batch_size, + n_row, + n_q_head, + n_kv_head, + head_dim, + mrope_section[0], + mrope_section[1], + BLOCK_Q, + BLOCK_K, + BACKWARD_PASS=False, + ) + return q.transpose(1, 2), k.transpose(1, 2), cos, sin + + +def qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section): + dq = dq.transpose(1, 2) + dk = dk.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = dq.shape + n_kv_head = dk.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + n_row = batch_size * seq_len + + # ensure dq and dk are contiguous + dq = dq.contiguous() + dk = dk.contiguous() + + dtype_size = dq.element_size() + BLOCK_Q, BLOCK_K = get_optimal_block_size_mrope(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_row) + + _triton_qwen2vl_mrope_npu[(grid_size,)]( + dq, + dq.stride(1), + dk, + dk.stride(1), + cos, + sin, + seq_len, + batch_size, + n_row, + n_q_head, + n_kv_head, + head_dim, + mrope_section[0], + mrope_section[1], + BLOCK_Q, + BLOCK_K, + BACKWARD_PASS=True, + ) + return dq.transpose(1, 2), dk.transpose(1, 2) + + +class LigerQwen2VLMRopeFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, cos, sin, mrope_section, unsqueeze_dim=1): + """ + q size: (bsz, n_q_head, seq_len, head_dim) + k size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (3, bsz, seq_len, head_dim) + sin size: (3, bsz, seq_len, head_dim) + """ + q, k, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section) + ctx.save_for_backward(cos, sin) + ctx.mrope_section = mrope_section + return q, k + + @staticmethod + def backward(ctx, dq, dk): + """ + dq size: (bsz, n_q_head, seq_len, head_dim) + dk size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (3, bsz, seq_len, head_dim) + sin size: (3, bsz, seq_len, head_dim) + """ + cos, sin = ctx.saved_tensors + mrope_section = ctx.mrope_section + dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section) + return dq, dk, None, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/rms_norm.py b/src/liger_kernel/ops/backends/_ascend/ops/rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..f52fe40e16d58ffd716d307b1f037274a5c8408c --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/rms_norm.py @@ -0,0 +1,782 @@ +import torch +import triton +import triton.language as tl + +from triton.language.math import rsqrt + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import torch_to_triton_dtype + +_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1) +_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0) +_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1) + + +def torch_dtype_to_triton(dtype): + mapping = { + torch.float32: tl.float32, + torch.bfloat16: tl.bfloat16, + } + return mapping.get(dtype, tl.float32) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _rms_norm_forward_kernel_no_tiling( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps, + offset, + casting_mode: tl.constexpr, + 
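# llama: normalize in fp32 and cast back before the weight multiply;
+    # gemma: stay in fp32 through the affine step; none: keep the input dtype +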
elementwise_affine: tl.constexpr, + X_DTYPE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + NPU-optimized rms_norm forward kernel for small n_cols (< 2048). + + Performance optimizations: + 1. Use 2D vector loading to maximize UB utilization (e.g., (1,2048), (2,1024), (4,512)) + 2. Process multiple rows at once using 2D indexing + 3. Keep data in registers, minimize conversions + 4. Use optimal cache policies + + Used when n_cols < 2048 to avoid the overhead of column blocking. + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(X_DTYPE) + offset = offset.to(X_DTYPE) + + # Grid-stride loop setup for 2D blocks + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + if elementwise_affine: + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0) + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + # Load multiple rows at once using 2D indexing + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + + # Compute sum_square for all rows + if casting_mode == _CASTING_MODE_LLAMA or casting_mode == _CASTING_MODE_GEMMA: + X_rows = X_rows.to(tl.float32) + + sum_squares = tl.sum(tl.where(block_mask, X_rows * X_rows, 0.0), axis=1) + + # Compute rstd for all rows + mean_squares = sum_squares / n_cols + rstd_rows = rsqrt(mean_squares + eps) + + # Store rstd_rows + tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd_rows, mask=row_mask) + + # Apply casting based on mode + if casting_mode == _CASTING_MODE_GEMMA: + X_rows = X_rows.to(tl.float32) + if elementwise_affine: + W_row_fp32 = W_row.to(tl.float32) + elif casting_mode == _CASTING_MODE_LLAMA: + X_rows = X_rows.to(tl.float32) + + # Normalize + X_rows = X_rows * rstd_rows[:, None] + + # Cast back for Llama mode before weight multiplication + if casting_mode == _CASTING_MODE_LLAMA: + X_rows = X_rows.to(X_DTYPE) + + # Apply weight + if elementwise_affine: + if casting_mode == _CASTING_MODE_GEMMA: + Y_rows = X_rows * (offset + W_row_fp32[None, :]) + else: + Y_rows = X_rows * (offset + W_row[None, :]) + else: + Y_rows = X_rows + + # Cast back for Gemma mode + if casting_mode == _CASTING_MODE_GEMMA: + Y_rows = Y_rows.to(X_DTYPE) + + # Store results + tl.store(Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :], Y_rows, mask=block_mask) + + +# ----------------------------------------------------------------------------- +# Forward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _rms_norm_forward_kernel_tiled( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + eps, + offset, + casting_mode: tl.constexpr, + elementwise_affine: tl.constexpr, + X_DTYPE: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + NPU-optimized rms_norm forward kernel for large n_cols (>= 2048). + + This kernel processes rows using a grid-stride loop pattern: + 1. Each program handles multiple rows + 2. 
For each row, we process it in column chunks of BLOCK_SIZE + 3. Grid size is limited to NPU core count to avoid resource overflow + + This solves two problems: + 1. UB overflow when n_cols is too large (original kernel used n_cols as BLOCK_SIZE) + 2. Efficient multi-row processing within a single kernel launch + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(X_DTYPE) + offset = offset.to(X_DTYPE) + + offsets = tl.arange(0, BLOCK_SIZE) + # Grid-stride loop over rows + for row_idx in tl.range(pid, n_rows, num_progs): + Y_row_ptr = Y_ptr + row_idx * Y_row_stride + X_row_ptr = X_ptr + row_idx * X_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Accumulator for mean_square computation across all column blocks + sum_square = 0.0 + + # First pass: accumulate sum of squares + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + + if casting_mode == _CASTING_MODE_LLAMA or casting_mode == _CASTING_MODE_GEMMA: + X_block = X_block.to(tl.float32) + + # Accumulate sum of squares (only for valid elements) + sum_square += tl.sum(X_block * X_block) + + # Compute rstd for this row + mean_square = sum_square / n_cols + + rstd = rsqrt(mean_square + eps) + + # Store rstd + tl.store(RSTD_row_ptr, rstd) + + # Second pass: normalize and multiply by weight + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + # Load X_block + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, cache_modifier=".ca") + + if elementwise_affine: + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0) + + # Apply casting based on mode + if casting_mode == _CASTING_MODE_GEMMA: + X_block = X_block.to(tl.float32) + if elementwise_affine: + W_block = W_block.to(tl.float32) + elif casting_mode == _CASTING_MODE_LLAMA: + X_block = X_block.to(tl.float32) + + # Normalize + X_block = X_block * rstd + + # Cast back for Llama mode before weight multiplication + if casting_mode == _CASTING_MODE_LLAMA: + X_block = X_block.to(X_DTYPE) + + # Apply weight + if elementwise_affine: + Y_block = X_block * (offset + W_block) + else: + Y_block = X_block + + # Cast back for Gemma mode + if casting_mode == _CASTING_MODE_GEMMA: + Y_block = Y_block.to(X_DTYPE) + + # Store result + tl.store(Y_row_ptr + col_offsets, Y_block, mask=mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - No Tiling (for n_cols <= 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _rms_norm_backward_kernel_no_tiling( + dY_ptr, + dY_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + X_dtype: tl.constexpr, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, + dW_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + offset, + casting_mode: tl.constexpr, + elementwise_affine: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + """ + NPU-optimized rms_norm backward kernel for small n_cols (< 2048). + + Performance optimizations: + 1. Keep all data in registers, minimize conversions + 2. Reuse X_normalized (X * rstd) for both dX and dW + 3. 
Optimize computation order to reduce register pressure + 4. Combine operations where possible + 5. Use 2D vector loading to maximize UB utilization (e.g., (1,2048), (2,1024), (4,512)) + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-stride loop setup for 2D blocks + grid_stride = num_progs * BLOCK_SIZE_M + num_iterations = tl.cdiv(n_rows, grid_stride) + + col_offsets = tl.arange(0, BLOCK_SIZE_N) + col_mask = col_offsets < n_cols + row_offsets = tl.arange(0, BLOCK_SIZE_M) + + # Load W once for all iterations + if elementwise_affine: + W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0) + W_offset = W_row + offset + + # Grid-stride loop over row blocks + for i in range(num_iterations): + row_idx = i * grid_stride + pid * BLOCK_SIZE_M + row_offsets + row_mask = row_idx < n_rows + block_mask = row_mask[:, None] & col_mask[None, :] + + dY_rows = tl.load( + dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + X_rows = tl.load( + X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :], + mask=block_mask, + other=0.0, + eviction_policy="evict_first", + ) + + # Load rstd for all rows in the block + rstd_rows = tl.load(RSTD_ptr + row_idx * RSTD_row_stride, mask=row_mask, other=0.0) + + # Convert X to fp32 once + X_rows = X_rows.to(tl.float32) + + # Compute X_normalized (reused in dX and dW) + X_normalized = X_rows * rstd_rows[:, None] + + # Compute m based on casting mode and elementwise_affine + if elementwise_affine: + if casting_mode == _CASTING_MODE_LLAMA: + m_rows = (dY_rows * W_offset[None, :]).to(tl.float32) + # For dW in Llama mode, we need X_normalized in original dtype + X_normalized = X_normalized.to(X_dtype) + elif casting_mode == _CASTING_MODE_GEMMA: + m_rows = dY_rows.to(tl.float32) * W_offset[None, :] + else: + m_rows = dY_rows * W_offset[None, :] + else: + if casting_mode == _CASTING_MODE_LLAMA or casting_mode == _CASTING_MODE_GEMMA: + m_rows = dY_rows.to(tl.float32) + else: + m_rows = dY_rows + + # Compute sum(m * X) for correction factor + sum_m_X = tl.sum(tl.where(block_mask, m_rows * X_rows, 0.0), axis=1) + + # Compute correction factor + correction_factors = -(1.0 / n_cols) * rstd_rows * rstd_rows * sum_m_X + + # Compute dX = rstd * m + rstd * correction_factor * X + dX_rows = rstd_rows[:, None] * m_rows + rstd_rows[:, None] * correction_factors[:, None] * X_rows + + # Store dX + tl.store(dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :], dX_rows.to(X_dtype), mask=block_mask) + + if elementwise_affine: + # Compute dW contribution: dY * X_normalized + dW_rows = (dY_rows * X_normalized).to(tl.float32) + + # Accumulate to per-program dW buffer + dW_row_ptr = dW_ptr + pid * dW_row_stride + existing_dW = tl.load(dW_row_ptr + col_offsets, mask=col_mask, other=0.0) + new_dW = existing_dW + tl.sum(tl.where(block_mask, dW_rows, 0.0), axis=0) + tl.store(dW_row_ptr + col_offsets, new_dW, mask=col_mask) + + +# ----------------------------------------------------------------------------- +# Backward Kernel - With Tiling (for n_cols > 2048) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _rms_norm_backward_kernel_tiled( + dY_ptr, + dY_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + X_dtype: tl.constexpr, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, + dW_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + offset, + casting_mode: tl.constexpr, + elementwise_affine: 
tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + NPU-optimized rms_norm backward kernel for large n_cols (>= 2048). + + Each program processes multiple rows using grid-stride loop. + For each row, we process columns in blocks to avoid UB overflow. + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Initialize dW accumulator (per-program, will be reduced later) + num_col_blocks = tl.cdiv(n_cols, BLOCK_SIZE) + offsets = tl.arange(0, BLOCK_SIZE) + + # Grid-stride loop over rows + for row_idx in tl.range(pid, n_rows, num_progs): + # Base pointers for this row + dY_row_ptr = dY_ptr + row_idx * dY_row_stride + dX_row_ptr = dX_ptr + row_idx * dX_row_stride + X_row_ptr = X_ptr + row_idx * X_row_stride + RSTD_row_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Load rstd for this row + rstd = tl.load(RSTD_row_ptr) + + # First pass: compute sum(m * X) for the correction term + sum_m_X = 0.0 + + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + + # Convert to fp32 for computation + X_block = X_block.to(tl.float32) + + if elementwise_affine: + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0, eviction_policy="evict_first") + W_offset = W_block + offset + + # Compute m based on casting mode + if casting_mode == _CASTING_MODE_LLAMA: + m = (dY_block * W_offset).to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + dY_block = dY_block.to(tl.float32) + m = dY_block * W_offset + else: + m = dY_block * W_offset + else: + # Compute m based on casting mode + if casting_mode == _CASTING_MODE_LLAMA: + m = dY_block.to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + m = dY_block.to(tl.float32) + else: + m = dY_block + + # Accumulate sum(m * X) + sum_m_X += tl.sum(m * X_block) + + # Compute the correction factor + correction_factor = -(1.0 / n_cols) * rstd * rstd * sum_m_X + + # Second pass: compute gradients + for col_block_idx in range(num_col_blocks): + col_start = col_block_idx * BLOCK_SIZE + col_offsets = col_start + offsets + mask = col_offsets < n_cols + + dY_block = tl.load(dY_row_ptr + col_offsets, mask=mask, other=0.0) + X_block = tl.load(X_row_ptr + col_offsets, mask=mask, other=0.0) + + X_block = X_block.to(tl.float32) + + if elementwise_affine: + W_block = tl.load(W_ptr + col_offsets, mask=mask, other=0.0) + W_offset = W_block + offset + + # Compute m based on casting mode + if casting_mode == _CASTING_MODE_LLAMA: + m = (dY_block * W_offset).to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + dY_block = dY_block.to(tl.float32) + m = dY_block * W_offset + else: + m = dY_block * W_offset + else: + # Compute m based on casting mode + if casting_mode == _CASTING_MODE_LLAMA: + m = dY_block.to(tl.float32) + elif casting_mode == _CASTING_MODE_GEMMA: + m = dY_block.to(tl.float32) + else: + m = dY_block + + # Compute dX + dX_block = rstd * m + rstd * correction_factor * X_block + + # Store dX + tl.store(dX_row_ptr + col_offsets, dX_block.to(X_dtype), mask=mask) + + if elementwise_affine: + # Compute dW contribution (accumulate per program) + if casting_mode == _CASTING_MODE_LLAMA: + dW_block = dY_block * (X_block * rstd).to(X_dtype) + else: + dW_block = dY_block * (X_block * rstd) + + # Atomic add to dW_ptr (each program writes to its own row) + 
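# (Despite the comment above, no hardware atomic is actually needed: each + # program accumulates into its own private row `pid` of the (grid_size, n_cols) + # dW buffer, so the plain load-add-store below is race-free; the per-program + # rows are reduced on the host via _dW.sum(dim=0).) + 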
dW_row_ptr = dW_ptr + pid * dW_row_stride + + # Load existing dW, add contribution, store back + existing_dW = tl.load(dW_row_ptr + col_offsets, mask=mask, other=0.0) + new_dW = existing_dW + dW_block.to(tl.float32) + tl.store(dW_row_ptr + col_offsets, new_dW, mask=mask) + + +# ----------------------------------------------------------------------------- +# Helper Functions +# ----------------------------------------------------------------------------- + + +def get_optimal_block_size(n_cols, is_forward: bool): + """ + Calculate optimal block size for forward pass using compute_default_tiling_strategy. + + Memory analysis for forward pass (per row): + - Load: X_block, W_block (2 blocks) + - Compute: X_block (fp32), Y_block (1-2 blocks) + - Total: conservative estimate 6 blocks of memory + + Memory analysis for backward pass (per row): + - Load: dY_block, X_block, W_block (3 blocks) + - Compute: m, dX_block, dW_block (3 blocks) + - Store: dX_block, accumulated dW (2 blocks) + - Total: conservative estimate 8 blocks of memory + + Args: + n_cols: Number of columns in the tensor + is_forward: Whether this is for forward pass + + Returns: + Optimal block size + """ + if n_cols <= 2048: + return triton.next_power_of_2(n_cols) + + memory_multiplier = 6.0 if is_forward else 8.0 + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=memory_multiplier, + shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(2048, block_size) + else: + return 2048 + + +# ----------------------------------------------------------------------------- +# Forward and Backward Functions +# ----------------------------------------------------------------------------- + + +_str_to_casting_mode = { + "llama": _CASTING_MODE_LLAMA.value, + "gemma": _CASTING_MODE_GEMMA.value, + "none": _CASTING_MODE_NONE.value, +} + + +def rms_norm_forward(X, W, eps, offset, casting_mode): + if not isinstance(casting_mode, int): + assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}" + casting_mode = _str_to_casting_mode[casting_mode] + else: + assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}" + shape = X.shape + dim = shape[-1] + X = X.view(-1, dim) + n_rows, n_cols = X.shape + X_DTYPE = torch_dtype_to_triton(X.dtype) + + # Get optimal block size for column processing + BLOCK_SIZE = get_optimal_block_size(n_cols, True) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + + # RSTD is always fp32 for Llama/Gemma modes + rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype + RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device) + + if W is not None: + # Check constraints + assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension" + elementwise_affine = True + else: + elementwise_affine = False + + # Grid size limited to NPU core count + num_cores = get_npu_core_count() + grid_size = min(num_cores * 2, n_rows) + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Use no-tiling kernel for small n_cols + _rms_norm_forward_kernel_no_tiling[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + offset, + casting_mode, + elementwise_affine, + X_DTYPE, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + else: + # Use tiled kernel for large 
n_cols + _rms_norm_forward_kernel_tiled[(grid_size,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + offset, + casting_mode, + elementwise_affine, + X_DTYPE, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return Y.view(*shape), X, RSTD, casting_mode + + +def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, in_place): + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + n_rows, n_cols = dY.shape + + # Get NPU core count for grid size + num_cores = get_npu_core_count() + grid_size = min(num_cores * 2, n_rows) + + # Get optimal block size for backward pass + BLOCK_SIZE = get_optimal_block_size(n_cols, False) + BLOCK_SIZE_M = 2048 // BLOCK_SIZE + + if W is not None: + # fp32 for numerical stability + _dW = torch.zeros((grid_size, n_cols), dtype=torch.float32, device=W.device) + elementwise_affine = True + else: + _dW = None + elementwise_affine = False + + if in_place: + dX = dY + else: + dX = torch.empty_like(dY) + + # Choose kernel based on n_cols + if n_cols <= 2048: + # Use no-tiling kernel for small n_cols + _rms_norm_backward_kernel_no_tiling[(grid_size,)]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + torch_to_triton_dtype[X.dtype], + W, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0) if elementwise_affine else 0, + n_rows, + n_cols, + offset, + casting_mode, + elementwise_affine, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE, + ) + else: + # Use tiled kernel for large n_cols + _rms_norm_backward_kernel_tiled[(grid_size,)]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + torch_to_triton_dtype[X.dtype], + W, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0) if elementwise_affine else 0, + n_rows, + n_cols, + offset, + casting_mode, + elementwise_affine, + BLOCK_SIZE=BLOCK_SIZE, + ) + + dX = dX.view(*shape) + + if elementwise_affine: + dW = _dW.sum(dim=0).to(W.dtype) + else: + dW = None + + return dX, dW + + +class LigerRMSNormFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True, row_mode=None): + """ + X: (B, T, H) or (BxT, H) + W: (H,) + """ + if isinstance(X, torch.distributed.tensor.DTensor): + # Input tensor is output of a tensor parallel module and + # needs to be gathered to a local tensor to compute + # RMSE layer norm on each TP worker. + # TODO: support CP. + X = X.full_tensor() + + Y, X, RSTD, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode) + ctx.offset = offset + ctx.casting_mode = casting_mode + ctx.in_place = in_place + ctx.elementwise_affine = W is not None + if W is not None: + ctx.save_for_backward(X, W, RSTD) + else: + ctx.save_for_backward(X, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, dY): + """ + Y: (B, T, H) or (BxT, H) + """ + if ctx.elementwise_affine: + X, W, RSTD = ctx.saved_tensors + else: + X, RSTD = ctx.saved_tensors + W = None + if isinstance(dY, torch.distributed.tensor.DTensor): + # Gradients are output of a tensor parallel module and + # needs to be gathered to a local tensor for computing RMSE layer. + # TODO: support CP. 
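+ # (DTensor.full_tensor() performs the collectives needed to materialize the + # sharded gradient as a regular, replicated local tensor.)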
+ dY = dY.full_tensor() + + dX, dW = rms_norm_backward(dY, X, W, RSTD, ctx.offset, ctx.casting_mode, ctx.in_place) + return dX, dW, None, None, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/rope.py b/src/liger_kernel/ops/backends/_ascend/ops/rope.py new file mode 100755 index 0000000000000000000000000000000000000000..ba5391f8f654c77fd0288903efdf2ea4adbb82cb --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/rope.py @@ -0,0 +1,262 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _triton_rope_npu( + q_ptr, + q_row_stride, + k_ptr, + k_row_stride, + cos, + cos_row_stride, + sin, + sin_row_stride, + sl, + total_rows: tl.constexpr, + cos_bs: tl.constexpr, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + BLOCK_Q: tl.constexpr, + BLOCK_K: tl.constexpr, + BACKWARD_PASS: tl.constexpr = False, +): + program_id = tl.program_id(0) + num_programs = tl.num_programs(0) + + rows_per_program = (total_rows + num_programs - 1) // num_programs + start_row = program_id * rows_per_program + actual_rows = tl.minimum(rows_per_program, total_rows - start_row) + + for row_offset in tl.range(0, actual_rows): + pid = start_row + row_offset + + row_idx = pid % sl + cos_ptr = cos + tl.where(cos_bs == 1, row_idx * cos_row_stride, pid * cos_row_stride) + sin_ptr = sin + tl.where(cos_bs == 1, row_idx * sin_row_stride, pid * sin_row_stride) + + # Pre-compute d_idx and cos/sin values outside loops (they don't depend on heads) + d_idx = tl.arange(0, hd // 2) + d_mask = d_idx < (hd // 2) # Always True, but kept for clarity + cos_vals = tl.load(cos_ptr + d_idx, mask=d_mask, other=0) + sin_vals = tl.load(sin_ptr + d_idx, mask=d_mask, other=0) + + # Process q heads in chunks to prevent UB overflow + for qh_block in range(0, n_qh, BLOCK_Q): + qh_idx = tl.arange(0, BLOCK_Q) + qh_block + qh_mask = qh_idx < n_qh + + # block_mask: qh_mask broadcasted over d_idx dimension + block_mask = qh_mask[:, None] + + offsets = qh_idx[:, None] * hd + d_idx[None, :] + q_base = q_ptr + pid * q_row_stride + + q_left = tl.load(q_base + offsets, mask=block_mask, other=0) + q_right = tl.load(q_base + offsets + (hd // 2), mask=block_mask, other=0) + + if not BACKWARD_PASS: + new_left = q_left * cos_vals - q_right * sin_vals + new_right = q_right * cos_vals + q_left * sin_vals + else: + new_left = q_left * cos_vals + q_right * sin_vals + new_right = q_right * cos_vals - q_left * sin_vals + + tl.store(q_base + offsets, new_left, mask=block_mask) + tl.store(q_base + offsets + (hd // 2), new_right, mask=block_mask) + + # Process k heads in chunks to prevent UB overflow + for kh_block in range(0, n_kh, BLOCK_K): + kh_idx = tl.arange(0, BLOCK_K) + kh_block + kh_mask = kh_idx < n_kh + + # block_mask: kh_mask broadcasted over d_idx dimension + block_mask = kh_mask[:, None] + + offsets = kh_idx[:, None] * hd + d_idx[None, :] + k_base = k_ptr + pid * k_row_stride + + k_left = tl.load(k_base + offsets, mask=block_mask, other=0) + k_right = tl.load(k_base + offsets + (hd // 2), mask=block_mask, other=0) + + if not BACKWARD_PASS: + new_left = k_left * cos_vals - k_right * sin_vals + new_right = k_right * cos_vals + k_left * sin_vals + else: + new_left = k_left * cos_vals + k_right * sin_vals + new_right = k_right * cos_vals - k_left * sin_vals + + tl.store(k_base + offsets, new_left, mask=block_mask) + tl.store(k_base + offsets + (hd 
// 2), new_right, mask=block_mask) + + +def get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size): + # Compute tiling strategy based on UB capacity + # ROPE forward tiling strategy (based on optimized ROPE kernel): + # - cos_vals and sin_vals are loaded once outside loops (shared): pad_hd // 2 elements each + # - In q heads loop (peak memory): + # * q_left: BLOCK_Q * (pad_hd // 2) elements + # * q_right: BLOCK_Q * (pad_hd // 2) elements + # * new_left: BLOCK_Q * (pad_hd // 2) elements (intermediate result) + # * new_right: BLOCK_Q * (pad_hd // 2) elements (intermediate result) + # * Total: 4 * BLOCK_Q * (pad_hd // 2) = 2 * BLOCK_Q * pad_hd elements + # - In k heads loop (peak memory): + # * k_left: BLOCK_K * (pad_hd // 2) elements + # * k_right: BLOCK_K * (pad_hd // 2) elements + # * new_left: BLOCK_K * (pad_hd // 2) elements (intermediate result) + # * new_right: BLOCK_K * (pad_hd // 2) elements (intermediate result) + # * Total: 4 * BLOCK_K * (pad_hd // 2) = 2 * BLOCK_K * pad_hd elements + # - Since q and k are processed separately, peak memory is max(BLOCK_Q, BLOCK_K) case + # - Plus shared cos/sin: 2 * (pad_hd // 2) = pad_hd elements + # - Conservative estimate: (2 * BLOCK_SIZE * pad_hd + pad_hd) * dtype_size * 8 bits + # - Simplified: (2 * BLOCK_SIZE + 1) * pad_hd * dtype_size * 8 bits + # - For safety, use: memory_multiplier=3.0 * BLOCK_SIZE * pad_hd * dtype_size * 8 bits + # - shapes: ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd)) + # - tiling_dims: (0, 0) means first dimension of each shape can be tiled + # - Returns: ((block_size_q, pad_hd), (block_size_kv, pad_hd)) + shapes = ((pad_n_q_head, pad_hd), (pad_n_kv_head, pad_hd)) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.90, + dtype_size=dtype_size, + memory_multiplier=3.0, + shapes=shapes, + tiling_dims=(0, 0), + ) + + if tile_shapes is not None and len(tile_shapes) == len(shapes): + # Strategy returns ((block_size_q, pad_hd), (block_size_kv, pad_hd)) + q_tile_shape, k_tile_shape = tile_shapes + BLOCK_Q, _ = q_tile_shape + BLOCK_K, _ = k_tile_shape + else: + # Fallback to conservative defaults + BLOCK_Q = 2048 + BLOCK_K = 2048 + + return BLOCK_Q, BLOCK_K + + +def rope_forward(q, k, cos, sin): + # transpose it back to the physical shape because Triton looks at the physical storage + # note: q and k are incontiguous before the transformation and will become contiguous after transpose + q = q.transpose(1, 2) + k = k.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = q.shape + n_kv_head = k.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + n_row = batch_size * seq_len + + # ensure tensors passed into the kernel are contiguous. 
It will be no-op if they are already contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + cos_batch_size = cos.shape[0] + + dtype_size = q.element_size() + BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_row) + + _triton_rope_npu[(grid_size,)]( + q, + q.stride(1), + k, + k.stride(1), + cos, + cos.stride(-2), + sin, + sin.stride(-2), + seq_len, + n_row, + cos_batch_size, + n_q_head, + n_kv_head, + head_dim, + BLOCK_Q, + BLOCK_K, + BACKWARD_PASS=False, + ) + return q.transpose(1, 2), k.transpose(1, 2), cos, sin + + +def rope_backward(dq, dk, cos, sin): + dq = dq.transpose(1, 2) + dk = dk.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = dq.shape + cos_batch_size = cos.shape[0] + n_kv_head = dk.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + + n_row = batch_size * seq_len + + # ensure dq and dk are contiguous + dq = dq.contiguous() + dk = dk.contiguous() + + dtype_size = dq.element_size() + BLOCK_Q, BLOCK_K = get_optimal_block_size(pad_n_q_head, pad_n_kv_head, pad_hd, dtype_size) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, n_row) + + _triton_rope_npu[(grid_size,)]( + dq, + dq.stride(1), + dk, + dk.stride(1), + cos, + cos.stride(-2), + sin, + sin.stride(-2), + seq_len, + n_row, + cos_batch_size, + n_q_head, + n_kv_head, + head_dim, + BLOCK_Q, + BLOCK_K, + BACKWARD_PASS=True, + ) + return dq.transpose(1, 2), dk.transpose(1, 2) + + +class LigerRopeFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """ + q size: (bsz, n_q_head, seq_len, head_dim) + k size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim) + sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim) + """ + q, k, cos, sin = rope_forward(q, k, cos, sin) + ctx.save_for_backward(cos, sin) + return q, k + + @staticmethod + def backward(ctx, dq, dk): + """ + dq size: (bsz, n_q_head, seq_len, head_dim) + dk size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim) + sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim) + """ + + cos, sin = ctx.saved_tensors + dq, dk = rope_backward(dq, dk, cos, sin) + return dq, dk, None, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/softmax.py b/src/liger_kernel/ops/backends/_ascend/ops/softmax.py new file mode 100755 index 0000000000000000000000000000000000000000..d9144d5c8a87d60dc18b51ab59fe92a4ad42db84 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/softmax.py @@ -0,0 +1,344 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _softmax_single_block_forward_kernel( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + ROWS_PER_BLOCK: tl.constexpr, +): + """ + Single-block softmax forward kernel for small column sizes. + + Processes entire row in one block when n_cols <= BLOCK_SIZE. + Uses 2D tensor to process multiple rows simultaneously for better UB utilization. 
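+ + For example, with n_cols = 1000 the launcher below picks BLOCK_SIZE = 1024 and + ROWS_PER_BLOCK = 8192 // 1024 = 8, so each program loads (8, 1024) tiles and + reduces max/sum along axis=1.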
+ + Args: + Y_ptr: Output tensor pointer + Y_row_stride: Stride for output rows + X_ptr: Input tensor pointer + X_row_stride: Stride for input rows + n_rows: Number of rows to process + n_cols: Number of columns per row + BLOCK_SIZE: Block size for column processing + ROWS_PER_BLOCK: Number of rows to process simultaneously + """ + row_block_start = tl.program_id(0) * ROWS_PER_BLOCK + row_block_step = tl.num_programs(0) * ROWS_PER_BLOCK + + row_offsets = tl.arange(0, ROWS_PER_BLOCK) + col_offsets = tl.arange(0, BLOCK_SIZE) + + for row_block_idx in tl.range(row_block_start, n_rows, row_block_step): + row_idx = row_block_idx + row_offsets + row_mask = row_idx < n_rows + col_mask = col_offsets < n_cols + + # 2D mask: [ROWS_PER_BLOCK, BLOCK_SIZE] + mask = row_mask[:, None] & col_mask[None, :] + + # Load 2D block: [ROWS_PER_BLOCK, BLOCK_SIZE] + offsets = row_idx[:, None] * X_row_stride + col_offsets[None, :] + x = tl.load(X_ptr + offsets, mask=mask, other=float("-inf")) + + # Compute softmax per row (axis=1) + m = tl.max(x, axis=1) + e = tl.exp(x - m[:, None]) + d = tl.sum(e, axis=1) + y = e / d[:, None] + + # Store 2D block + offsets = row_idx[:, None] * Y_row_stride + col_offsets[None, :] + tl.store(Y_ptr + offsets, y, mask=mask) + + +@triton.jit +def _softmax_multi_block_forward_kernel( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + Multi-block softmax forward kernel using two-pass algorithm. + + First pass computes max and sum for numerical stability. + Second pass normalizes and writes output. + + Args: + Y_ptr: Output tensor pointer + Y_row_stride: Stride for output rows + X_ptr: Input tensor pointer + X_row_stride: Stride for input rows + n_rows: Number of rows to process + n_cols: Number of columns per row + BLOCK_SIZE: Block size for column processing + """ + row_start = tl.program_id(0) + num_prog = tl.num_programs(0) + col_offsets = tl.arange(0, BLOCK_SIZE) + + for row_idx in tl.range(row_start, n_rows, num_prog): + row_start_ptr = X_ptr + row_idx * X_row_stride + m = tl.float32(float("-inf")) + d = tl.float32(0.0) + + for start in tl.range(0, n_cols, BLOCK_SIZE): + idx = start + col_offsets + mask = idx < n_cols + xblk = tl.load( + row_start_ptr + idx, mask=mask, other=float("-inf"), eviction_policy="evict_first", cache_modifier=".ca" + ) + blk_max = tl.max(xblk, axis=0) + new_m = tl.maximum(m, blk_max) + d = d * tl.exp(m - new_m) + tl.sum(tl.exp(xblk - new_m), axis=0) + m = new_m + + for start in tl.range(0, n_cols, BLOCK_SIZE): + idx = start + col_offsets + mask = idx < n_cols + xblk = tl.load( + row_start_ptr + idx, mask=mask, other=float("-inf"), eviction_policy="evict_first", cache_modifier=".ca" + ) + yblk = tl.exp(xblk - m) / d + tl.store(Y_ptr + row_idx * Y_row_stride + idx, yblk, mask=mask, cache_modifier=".cs") + + +@triton.jit +def _softmax_single_block_backward_kernel( + dy_ptr, + dy_stride, + y_ptr, + y_stride, + dx_ptr, + dx_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + ROWS_PER_BLOCK: tl.constexpr, +): + """ + Single-block softmax backward kernel for small column sizes. + + Computes gradient: dx = y * (dy - sum(dy * y)) + Uses 2D tensor to process multiple rows simultaneously for better UB utilization. 
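+ + Per row this is the softmax Jacobian-vector product: with J = diag(y) - y y^T, + dx = J^T dy = y * (dy - sum(dy * y)).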
+ + Args: + dy_ptr: Gradient output pointer + dy_stride: Stride for gradient output rows + y_ptr: Forward output pointer + y_stride: Stride for forward output rows + dx_ptr: Gradient input pointer + dx_stride: Stride for gradient input rows + n_rows: Number of rows to process + n_cols: Number of columns per row + BLOCK_SIZE: Block size for column processing + ROWS_PER_BLOCK: Number of rows to process simultaneously + """ + row_block_start = tl.program_id(0) * ROWS_PER_BLOCK + row_block_step = tl.num_programs(0) * ROWS_PER_BLOCK + + row_offsets = tl.arange(0, ROWS_PER_BLOCK) + col_offsets = tl.arange(0, BLOCK_SIZE) + + for row_block_idx in tl.range(row_block_start, n_rows, row_block_step): + row_idx = row_block_idx + row_offsets + row_mask = row_idx < n_rows + col_mask = col_offsets < n_cols + + # 2D mask: [ROWS_PER_BLOCK, BLOCK_SIZE] + mask = row_mask[:, None] & col_mask[None, :] + + # Load 2D blocks: [ROWS_PER_BLOCK, BLOCK_SIZE] + dy_offsets = row_idx[:, None] * dy_stride + col_offsets[None, :] + y_offsets = row_idx[:, None] * y_stride + col_offsets[None, :] + + dy = tl.load(dy_ptr + dy_offsets, mask=mask, other=0.0) + y = tl.load(y_ptr + y_offsets, mask=mask, other=0.0) + + # Compute dot product per row (axis=1) + dot = tl.sum(dy * y, axis=1) + dx = y * (dy - dot[:, None]) + + # Store 2D block + dx_offsets = row_idx[:, None] * dx_stride + col_offsets[None, :] + tl.store(dx_ptr + dx_offsets, dx, mask=mask) + + +@triton.jit +def _softmax_multi_block_backward_kernel( + dy_ptr, + dy_stride, + y_ptr, + y_stride, + dx_ptr, + dx_stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + Multi-block softmax backward kernel using two-pass algorithm. + + Computes gradient: dx = y * (dy - sum(dy * y)) + + Args: + dy_ptr: Gradient output pointer + dy_stride: Stride for gradient output rows + y_ptr: Forward output pointer + y_stride: Stride for forward output rows + dx_ptr: Gradient input pointer + dx_stride: Stride for gradient input rows + n_rows: Number of rows to process + n_cols: Number of columns per row + BLOCK_SIZE: Block size for column processing + """ + row_start = tl.program_id(0) + num_prog = tl.num_programs(0) + col_offsets = tl.arange(0, BLOCK_SIZE) + + for row_idx in tl.range(row_start, n_rows, num_prog): + dy_start_ptr = dy_ptr + row_idx * dy_stride + y_start_ptr = y_ptr + row_idx * y_stride + acc = 0.0 + + for start in tl.range(0, n_cols, BLOCK_SIZE): + idx = start + col_offsets + mask = idx < n_cols + dy_blk = tl.load(dy_start_ptr + idx, mask=mask, other=0.0, eviction_policy="evict_first") + y_blk = tl.load( + y_start_ptr + idx, mask=mask, other=0.0, eviction_policy="evict_first", cache_modifier=".ca" + ) + acc += tl.sum(dy_blk * y_blk, axis=0) + + for start in tl.range(0, n_cols, BLOCK_SIZE): + idx = start + col_offsets + mask = idx < n_cols + dy_blk = tl.load(dy_start_ptr + idx, mask=mask, other=0.0) + y_blk = tl.load(y_start_ptr + idx, mask=mask, other=0.0, cache_modifier=".ca") + dx_blk = y_blk * (dy_blk - acc) + tl.store(dx_ptr + row_idx * dx_stride + idx, dx_blk, mask=mask, cache_modifier=".wb") + + +def _softmax_forward(x): + *batch, n_cols = x.shape + x2d = x.contiguous().view(-1, n_cols) + n_rows = x2d.shape[0] + MAX_FUSED_BLOCK_SIZE = 8192 + + BLOCK_SIZE = triton.next_power_of_2(n_cols) + BLOCK_SIZE = min(BLOCK_SIZE, MAX_FUSED_BLOCK_SIZE) + + y2d = torch.empty_like(x2d) + num_cores = get_npu_core_count() + + if n_cols <= BLOCK_SIZE: + # Calculate optimal ROWS_PER_BLOCK to utilize UB efficiently + # Target: ROWS_PER_BLOCK * BLOCK_SIZE 
<= MAX_FUSED_BLOCK_SIZE + ROWS_PER_BLOCK = min(MAX_FUSED_BLOCK_SIZE // BLOCK_SIZE, 32) + ROWS_PER_BLOCK = triton.next_power_of_2(ROWS_PER_BLOCK) + + # Calculate number of programs needed + num_row_blocks = (n_rows + ROWS_PER_BLOCK - 1) // ROWS_PER_BLOCK + num_programs = min(num_cores, num_row_blocks) + + _softmax_single_block_forward_kernel[(num_programs,)]( + y2d, y2d.stride(0), x2d, x2d.stride(0), n_rows, n_cols, BLOCK_SIZE=BLOCK_SIZE, ROWS_PER_BLOCK=ROWS_PER_BLOCK + ) + multi_block_launch = False + else: + num_programs = min(num_cores, n_rows) + ROWS_PER_BLOCK = 1 # Not used in multi-block + + _softmax_multi_block_forward_kernel[(num_programs,)]( + y2d, y2d.stride(0), x2d, x2d.stride(0), n_rows, n_cols, BLOCK_SIZE=BLOCK_SIZE + ) + multi_block_launch = True + + return y2d.view(*batch, n_cols), BLOCK_SIZE, ROWS_PER_BLOCK, multi_block_launch + + +def _softmax_backward( + dy: torch.Tensor, + y: torch.Tensor, + BLOCK_SIZE: int, + ROWS_PER_BLOCK: int, + multi_block_launch: bool, +) -> torch.Tensor: + *batch, n_cols = dy.shape + dy2d = dy.contiguous().view(-1, n_cols) + y2d = y.contiguous().view(-1, n_cols) + n_rows = dy2d.shape[0] + dx2d = torch.empty_like(dy2d) + + num_cores = get_npu_core_count() + + if not multi_block_launch and n_cols <= BLOCK_SIZE: + num_row_blocks = (n_rows + ROWS_PER_BLOCK - 1) // ROWS_PER_BLOCK + num_programs = min(num_cores, num_row_blocks) + _softmax_single_block_backward_kernel[(num_programs,)]( + dy2d, + dy2d.stride(0), + y2d, + y2d.stride(0), + dx2d, + dx2d.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ROWS_PER_BLOCK=ROWS_PER_BLOCK, + ) + else: + num_programs = min(num_cores, n_rows) + + _softmax_multi_block_backward_kernel[(num_programs,)]( + dy2d, + dy2d.stride(0), + y2d, + y2d.stride(0), + dx2d, + dx2d.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + + return dx2d.view(*batch, n_cols) + + +class LigerSoftmaxFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, input_: torch.Tensor): + y, BLOCK_SIZE, ROWS_PER_BLOCK, multi_block_launch = _softmax_forward(input_) + ctx.save_for_backward(y) + ctx.BLOCK_SIZE = BLOCK_SIZE + ctx.ROWS_PER_BLOCK = ROWS_PER_BLOCK + ctx.multi_block_launch = multi_block_launch + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output): + (y,) = ctx.saved_tensors + dx = _softmax_backward( + grad_output, + y, + ctx.BLOCK_SIZE, + ctx.ROWS_PER_BLOCK, + ctx.multi_block_launch, + ) + return dx diff --git a/src/liger_kernel/ops/backends/_ascend/ops/sparsemax.py b/src/liger_kernel/ops/backends/_ascend/ops/sparsemax.py new file mode 100755 index 0000000000000000000000000000000000000000..a6deaf8af89cfc62458e722ab61ba56bcb76562b --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/sparsemax.py @@ -0,0 +1,385 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + + +@triton.jit +def _sparsemax_forward_kernel( + x_ptr, + x_stride_row, + sorted_x_ptr, + sorted_x_stride_row, + o_ptr, + o_stride_row, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """Sparsemax forward kernel for rows where n_cols <= BLOCK_SIZE. + + Args: + x_ptr: pointer to input tensor [n_rows, n_cols], fp32. + x_stride_row: row stride of x. + sorted_x_ptr: pointer to x sorted descending along last dim, fp32. 
+ sorted_x_stride_row: row stride of sorted_x. + o_ptr: pointer to output tensor [n_rows, n_cols]. + o_stride_row: row stride of o. + n_rows: number of rows (constexpr). + n_cols: number of columns (constexpr). + BLOCK_SIZE: tile size >= n_cols (constexpr). + """ + pid_row = tl.program_id(0) + num_progs = tl.num_programs(0) + + for row in tl.range(pid_row, n_rows, num_progs): + ptr_x_data_row = x_ptr + row * x_stride_row + ptr_sorted_x_data_row = sorted_x_ptr + row * sorted_x_stride_row + ptr_output_row = o_ptr + row * o_stride_row + + offs = tl.arange(0, BLOCK_SIZE) + mask = offs < n_cols + + z_sorted_block = tl.load( + ptr_sorted_x_data_row + offs, + mask=mask, + other=-float("inf"), + cache_modifier=".cg", + ).to(tl.float32) + + z_valid = tl.where(mask, z_sorted_block, 0.0) + cssv = tl.cumsum(z_valid, 0) + + r = (offs + 1).to(tl.float32) + t_vec = (cssv - 1.0) / r + support = (z_sorted_block > t_vec) & mask + + k_int = tl.sum(support.to(tl.int32), 0) + k_clamped_int = tl.maximum(k_int, 1) + k = k_clamped_int.to(tl.float32) + + s = tl.sum(tl.where(support, z_sorted_block, 0.0), 0) + tau = (s - 1.0) / k + + x_block = tl.load( + ptr_x_data_row + offs, + mask=mask, + other=0.0, + cache_modifier=".cg", + ).to(tl.float32) + + y = tl.maximum(x_block - tau, 0.0) + + tl.store( + ptr_output_row + offs, + y.to(ptr_output_row.dtype.element_ty), + mask=mask, + cache_modifier=".cs", + ) + + +@triton.jit +def _sparsemax_forward_tiled_kernel( + x_ptr, + x_stride_row, + sorted_x_ptr, + sorted_x_stride_row, + o_ptr, + o_stride_row, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """Sparsemax forward kernel for rows where n_cols > BLOCK_SIZE (tiled). + + Args: + x_ptr: pointer to input tensor [n_rows, n_cols], fp32. + x_stride_row: row stride of x. + sorted_x_ptr: pointer to x sorted descending along last dim, fp32. + sorted_x_stride_row: row stride of sorted_x. + o_ptr: pointer to output tensor [n_rows, n_cols]. + o_stride_row: row stride of o. + n_rows: number of rows (constexpr). + n_cols: number of columns (constexpr). + BLOCK_SIZE: tile size < n_cols (constexpr). + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + for row in tl.range(pid, n_rows, num_progs): + sorted_row_ptr = sorted_x_ptr + row * sorted_x_stride_row + x_row_ptr = x_ptr + row * x_stride_row + out_row_ptr = o_ptr + row * o_stride_row + offs = tl.arange(0, BLOCK_SIZE) + + # ------------------------------------------------------------------ + # Pass 1: find tau from sorted data + # Since data is sorted descending, support is a contiguous prefix, + # so k = sum(support) — no need for max(support_r), saves one reduction. 
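+ # Concretely (Martins & Astudillo, 2016): with z sorted descending and + # cssv_j = sum_{i <= j} z_(i), the support size is + # k = max{ j : z_(j) > (cssv_j - 1) / j } and tau = (cssv_k - 1) / k.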
+ # ------------------------------------------------------------------ + running_sum = tl.zeros((), tl.float32) + k = tl.zeros((), tl.int32) + sum_support = tl.zeros((), tl.float32) + + for tile in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + idx = tile * BLOCK_SIZE + offs + mask = idx < n_cols + + z = tl.load(sorted_row_ptr + idx, mask=mask, other=0.0, cache_modifier=".ca").to(tl.float32) + + cssv = tl.cumsum(z, axis=0) + running_sum + r = (idx + 1).to(tl.float32) + t = (cssv - 1.0) / r + support = (z > t) & mask + + k += tl.sum(support.to(tl.int32), axis=0) + sum_support += tl.sum(tl.where(support, z, 0.0), axis=0) + running_sum += tl.sum(z, axis=0) + + tau = (sum_support - 1.0) / tl.maximum(k, 1).to(tl.float32) + + # ------------------------------------------------------------------ + # Pass 2: write output y = max(x - tau, 0) + # ------------------------------------------------------------------ + for tile in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + idx = tile * BLOCK_SIZE + offs + mask = idx < n_cols + + x = tl.load(x_row_ptr + idx, mask=mask, other=0.0, cache_modifier=".ca").to(tl.float32) + y = tl.maximum(x - tau, 0.0) + + tl.store(out_row_ptr + idx, y.to(out_row_ptr.dtype.element_ty), mask=mask, cache_modifier=".cs") + + +@triton.jit +def _sparsemax_backward_kernel( + o_ptr, + go_ptr, + gi_ptr, + stride, + n_rows: tl.constexpr, + n_cols: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """Sparsemax backward kernel for rows where n_cols <= BLOCK_SIZE. + + Args: + o_ptr: pointer to forward output [n_rows, n_cols], fp32. + go_ptr: pointer to upstream gradient [n_rows, n_cols]. + gi_ptr: pointer to input gradient output [n_rows, n_cols]. + stride: common row stride for o, go, gi. + n_rows: number of rows (constexpr). + n_cols: number of columns (constexpr). + BLOCK_SIZE: tile size >= n_cols (constexpr). + """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + for row in tl.range(pid, n_rows, num_progs): + o_row = o_ptr + row * stride + go_row = go_ptr + row * stride + gi_row = gi_ptr + row * stride + + offs = tl.arange(0, BLOCK_SIZE) + mask = offs < n_cols + + o_val = tl.load(o_row + offs, mask=mask, other=0.0).to(tl.float32) + go_val = tl.load(go_row + offs, mask=mask, other=0.0).to(tl.float32) + supp = (o_val > 0.0) & mask + + go_sum = tl.sum(tl.where(supp, go_val, 0.0), axis=0) + supp_cnt = tl.sum(supp.to(tl.float32), axis=0) + + gi_val = tl.where( + supp, + go_val - go_sum / tl.maximum(supp_cnt, 1.0), + 0.0, + ) + tl.store(gi_row + offs, gi_val.to(gi_row.dtype.element_ty), mask=mask) + + +@triton.jit +def _sparsemax_backward_tiled_kernel( + o_ptr, go_ptr, gi_ptr, stride, n_rows: tl.constexpr, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr +): + """Sparsemax backward kernel for rows where n_cols > BLOCK_SIZE (tiled). + + Args: + o_ptr: pointer to forward output [n_rows, n_cols], fp32. + go_ptr: pointer to upstream gradient [n_rows, n_cols]. + gi_ptr: pointer to input gradient output [n_rows, n_cols]. + stride: common row stride for o, go, gi. + n_rows: number of rows (constexpr). + n_cols: number of columns (constexpr). + BLOCK_SIZE: tile size < n_cols (constexpr). 
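+ + Gradient: with support S = {i : o_i > 0}, gi_i = go_i - (1/|S|) * sum_{j in S} go_j + for i in S, and gi_i = 0 elsewhere.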
+ """ + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + for row in tl.range(pid, n_rows, num_progs): + o_row = o_ptr + row * stride + go_row = go_ptr + row * stride + gi_row = gi_ptr + row * stride + + offs = tl.arange(0, BLOCK_SIZE) + + supp_cnt = tl.zeros((), tl.float32) + go_sum = tl.zeros((), tl.float32) + + for i in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + offs_iter = i * BLOCK_SIZE + offs + mask_iter = offs_iter < n_cols + o_val = tl.load(o_row + offs_iter, mask=mask_iter, other=0.0, cache_modifier=".ca").to(tl.float32) + go_val = tl.load(go_row + offs_iter, mask=mask_iter, other=0.0).to(tl.float32) + supp = o_val > 0 + go_sum += tl.sum(tl.where(supp, go_val, 0.0)) + supp_cnt += tl.sum(supp.to(tl.float32)) + + for i in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + offs_iter = i * BLOCK_SIZE + offs + mask_iter = offs_iter < n_cols + o_val = tl.load(o_row + offs_iter, mask=mask_iter, other=0.0, cache_modifier=".ca").to(tl.float32) + go_val = tl.load(go_row + offs_iter, mask=mask_iter, other=0.0).to(tl.float32) + + supp = o_val > 0 + gi_val = tl.where( + supp, + go_val - tl.cast(go_sum / tl.maximum(supp_cnt, 1e-6), gi_row.dtype.element_ty).to(tl.float32), + 0.0, + ) + tl.store(gi_row + offs_iter, gi_val.to(gi_row.dtype.element_ty), mask=mask_iter, cache_modifier=".cs") + + +def sparsemax_forward(x, dim): + if dim < 0: + dim += x.dim() + + x_sw = x.transpose(dim, -1).contiguous() + n_cols = x_sw.size(-1) + n_rows = x_sw.numel() // n_cols + x_flat = x_sw.view(n_rows, n_cols) + + x_flat_fp32 = x_flat if x_flat.dtype == torch.float32 else x_flat.float() + x_sorted_flat = torch.sort(x_flat_fp32, dim=-1, descending=True).values + + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=12.0, + shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + BLOCK_SIZE = tile_shapes[0][0] + else: + BLOCK_SIZE = 2048 + + out_flat = torch.empty_like(x_flat_fp32) + grid = (min(n_rows, get_npu_core_count()),) + + if n_cols <= BLOCK_SIZE: + # non-tiled kernel: single load covers whole row + _sparsemax_forward_kernel[grid]( + x_flat_fp32, + x_flat_fp32.stride(0), + x_sorted_flat, + x_sorted_flat.stride(0), + out_flat, + out_flat.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + else: + # tiled kernel: compute tau and write output in one fused kernel + _sparsemax_forward_tiled_kernel[grid]( + x_flat_fp32, + x_flat_fp32.stride(0), + x_sorted_flat, + x_sorted_flat.stride(0), + out_flat, + out_flat.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + + y = out_flat.view(x_sw.shape).transpose(dim, -1) + return y, out_flat + + +def sparsemax_backward( + grad_out: torch.Tensor, + out_flat: torch.Tensor, + dim: int, +) -> torch.Tensor: + if dim < 0: + dim += grad_out.dim() + + grad_sw = grad_out.transpose(dim, -1).contiguous() + n_cols = grad_sw.size(-1) + n_rows = grad_sw.numel() // n_cols + go_flat = grad_sw.view(n_rows, n_cols) + + dx_flat = torch.empty_like(go_flat).contiguous() + grid = (min(n_rows, get_npu_core_count()),) + + # use single-pass kernel when feasible + if n_cols <= 4096: + BLOCK_SIZE = triton.next_power_of_2(n_cols) + _sparsemax_backward_kernel[grid]( + out_flat, + go_flat, + dx_flat, + out_flat.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + + else: + # use tiling strategy for very large n_cols: ~10 live buffers at peak = 10.0 multiplier + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, + dtype_size=4, + memory_multiplier=8.0, + 
shapes=((n_cols,),), + tiling_dims=(0,), + ) + + if tile_shapes and len(tile_shapes) > 0: + BLOCK_SIZE = tile_shapes[0][0] + else: + BLOCK_SIZE = 2048 + + _sparsemax_backward_tiled_kernel[grid]( + out_flat, + go_flat, + dx_flat, + out_flat.stride(0), + n_rows, + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + ) + + dx = dx_flat.view_as(grad_sw).transpose(dim, -1) + return dx + + +class LigerSparsemaxFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, x: torch.Tensor, dim: int): + y, out_flat = sparsemax_forward(x, dim) + ctx.save_for_backward(out_flat) + ctx.dim = dim + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_out: torch.Tensor): + (out_flat,) = ctx.saved_tensors + dx = sparsemax_backward(grad_out, out_flat, ctx.dim) + return dx, None diff --git a/src/liger_kernel/ops/backends/_ascend/ops/swiglu.py b/src/liger_kernel/ops/backends/_ascend/ops/swiglu.py new file mode 100755 index 0000000000000000000000000000000000000000..9c244742e0c375d9eb08c424b75b244bbdd7771e --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/swiglu.py @@ -0,0 +1,136 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import get_npu_core_count + +# ----------------------------------------------------------------------------- +# Kernels (High-performance 1D Flatten Implementation) +# ----------------------------------------------------------------------------- + + +@triton.jit +def _swiglu_forward_kernel_flat(a_ptr, b_ptr, c_ptr, total_elements, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + + # Grid-Stride Loop + start_idx = pid * BLOCK_SIZE + stride = num_progs * BLOCK_SIZE + + for idx in tl.range(start_idx, total_elements, stride): + offsets = idx + tl.arange(0, BLOCK_SIZE) + mask = offsets < total_elements + + a_val = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + b_val = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + res = (a_val * tl.sigmoid(a_val)) * b_val + tl.store(c_ptr + offsets, res, mask=mask) + + +@triton.jit +def _swiglu_backward_kernel_flat(dc_ptr, a_ptr, b_ptr, da_ptr, db_ptr, total_elements, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + num_progs = tl.num_programs(0) + start_idx = pid * BLOCK_SIZE + stride = num_progs * BLOCK_SIZE + + for idx in tl.range(start_idx, total_elements, stride): + offsets = idx + tl.arange(0, BLOCK_SIZE) + mask = offsets < total_elements + + dc = tl.load(dc_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + a = tl.load(a_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + b = tl.load(b_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + + sig_a = tl.sigmoid(a) + silu_a = a * sig_a + term1 = silu_a * (1.0 - sig_a) + sig_a + + db = dc * silu_a + da = dc * b * term1 + + tl.store(da_ptr + offsets, da, mask=mask) + tl.store(db_ptr + offsets, db, mask=mask) + + +# ----------------------------------------------------------------------------- +# Helper: Call compute_default_tiling_strategy +# ----------------------------------------------------------------------------- + + +def get_optimal_block_size(total_elements, is_backward=False): + """ + Calculate optimal Block Size using compute_default_tiling_strategy + """ + # 1. 
Set Memory Multiplier + # Forward is lighter, Backward requires more memory for intermediate variables + # 8.0 and 12.0 are empirical values based on Atlas 800I A2 UB (192KB) + multiplier = 12.0 if is_backward else 8.0 + + # 2. Call calculation function + # Treat input as 1D (total_elements,), only tiling on dim 0 + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.9, dtype_size=4, memory_multiplier=multiplier, shapes=((total_elements,),), tiling_dims=(0,) + ) + + # 3. Parse result + if tile_shapes and len(tile_shapes) > 0: + block_size = tile_shapes[0][0] + return max(256, block_size) + else: + return 2048 + + +def swiglu_forward(a, b): + if not a.is_contiguous(): + a = a.contiguous() + if not b.is_contiguous(): + b = b.contiguous() + + total_elements = a.numel() + c = torch.empty_like(a) + + block_size = get_optimal_block_size(total_elements, is_backward=False) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, (total_elements + block_size - 1) // block_size) + + _swiglu_forward_kernel_flat[(grid_size,)](a, b, c, total_elements, BLOCK_SIZE=block_size) + return c + + +def swiglu_backward(a, b, dc): + if not dc.is_contiguous(): + dc = dc.contiguous() + if not a.is_contiguous(): + a = a.contiguous() + if not b.is_contiguous(): + b = b.contiguous() + + total_elements = dc.numel() + grad_a = torch.empty_like(a) + grad_b = torch.empty_like(b) + + block_size = get_optimal_block_size(total_elements, is_backward=True) + + num_cores = get_npu_core_count() + grid_size = min(num_cores, (total_elements + block_size - 1) // block_size) + + _swiglu_backward_kernel_flat[(grid_size,)](dc, a, b, grad_a, grad_b, total_elements, BLOCK_SIZE=block_size) + return grad_a, grad_b + + +class LigerSiLUMulFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, a, b): + c = swiglu_forward(a, b) + ctx.save_for_backward(a, b) + return c + + @staticmethod + def backward(ctx, dc): + a, b = ctx.saved_tensors + grad_a, grad_b = swiglu_backward(a, b, dc) + return grad_a, grad_b diff --git a/src/liger_kernel/ops/backends/_ascend/ops/tvd.py b/src/liger_kernel/ops/backends/_ascend/ops/tvd.py new file mode 100755 index 0000000000000000000000000000000000000000..62a913a09a8c9532bf7523896cbfb2cee773f288 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ops/tvd.py @@ -0,0 +1,221 @@ +from typing import Literal +from typing import Optional + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.backends._ascend.ub_manager import compute_default_tiling_strategy +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count + +MAX_FUSED_SIZE = 65536 // 4 + +REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"] + + +@triton.jit +def _tv_distance_kernel( + p_ptr, + p_stride, + q_ptr, + q_stride, + loss_ptr, + loss_stride, + grads_ptr, + grads_stride, + label_ptr, + ignore_index: tl.constexpr, + n_cols, # V + total_rows: tl.constexpr, # BT + BLOCK_SIZE: tl.constexpr, + HAS_LABEL: tl.constexpr, + reduction: tl.constexpr = "batchmean", +): + thread_id = tl.program_id(0) + num_threads = tl.num_programs(0) + + for pid in tl.range(thread_id, total_rows, num_threads): + p_row_ptr = p_ptr + pid * p_stride + q_row_ptr = q_ptr + pid * q_stride + loss_row_ptr = loss_ptr + pid * loss_stride + grads_row_ptr = grads_ptr + pid * grads_stride + label_row_ptr = label_ptr + pid + + base_offsets = tl.arange(0, BLOCK_SIZE) + + should_skip = False + if HAS_LABEL: + label = tl.load(label_row_ptr) + if label == 
ignore_index: + should_skip = True + + if should_skip: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + base_offsets + mask = offsets < n_cols + tl.store(grads_row_ptr + offsets, 0.0, mask=mask) + if reduction == "none": + tl.store(loss_row_ptr + offsets, 0.0, mask=mask) + else: + loss_sum = 0.0 + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + base_offsets + mask = offsets < n_cols + + p = tl.load(p_row_ptr + offsets, mask=mask, other=0.0) + q = tl.load(q_row_ptr + offsets, mask=mask, other=0.0) + + # TVD(P || Q) = 0.5 * |P - Q| + tv_loss = 0.5 * tl.abs(p - q) + grad_res = tl.where(p > q, 0.5, -0.5) + + tl.store(grads_row_ptr + offsets, grad_res, mask=mask) + + if reduction == "none": + tl.store(loss_row_ptr + offsets, tv_loss, mask=mask) + else: + loss_sum += tl.sum(tv_loss, axis=0) + + if reduction != "none": + tl.store(loss_row_ptr, loss_sum) + + + def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label): + BT, V = p.shape + + # TVD forward tiling strategy + # - In the main loop (loss and gradient computation), the live buffers are: + # * p: BLOCK_SIZE elements + # * q: BLOCK_SIZE elements + # * tv_loss: BLOCK_SIZE elements + # * grad_res: BLOCK_SIZE elements + # * loss_sum: BLOCK_SIZE elements (when reduction != "none") + # * Total: 4 * BLOCK_SIZE elements, or 5 * BLOCK_SIZE when reduction != "none" + # - loss_sum is not always live, but budgeting for it also covers other shared + # buffers and the potential memory use of the HAS_LABEL path. + # - Conservative estimate: 5 * BLOCK_SIZE * dtype_size * 8 bits, hence memory_multiplier=5.0 + # - shapes: ((V,),) + # - tiling_dims: (0,) means the first dimension of the shape can be tiled + # - Returns: ((block_size,),) + shapes = ((V,),) + tile_shapes = compute_default_tiling_strategy( + safety_margin=0.80, + # In the TVD computation most values are implicitly promoted to fp32, so the fp32 size is used directly. + dtype_size=4, + memory_multiplier=5.0, + shapes=shapes, + tiling_dims=(0,), + ) + + if tile_shapes is not None and len(tile_shapes) > 0 and len(tile_shapes[0]) > 0: + # Strategy returns ((block_size,),) + BLOCK_SIZE = tile_shapes[0][0] + else: + # Fall back to the desired block size if no strategy is available (no tiling needed) + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + + num_cores = get_npu_core_count() + grid = (min(num_cores, BT),) + + out_size = (BT, V) if reduction == "none" else (BT,) + + # Loss and gradient accumulation in BF16 on NPU loses precision, so accumulate in fp32. + output_tensor = torch.zeros(out_size, device=p.device, dtype=torch.float32) + grads = torch.empty_like(p, dtype=torch.float32) + + n_non_ignore = (shift_labels != ignore_index).sum().item() if has_label else BT + + _tv_distance_kernel[grid]( + p, + p.stride(0), + q, + q.stride(0), + output_tensor, + output_tensor.stride(0), + grads, + grads.stride(0), + shift_labels if has_label else torch.empty(1, device=p.device), + ignore_index, + V, + BT, + BLOCK_SIZE=BLOCK_SIZE, + HAS_LABEL=has_label, + reduction=reduction, + ) + + if reduction == "batchmean": + return output_tensor.sum() / n_non_ignore, grads / n_non_ignore + elif reduction == "sum": + return output_tensor.sum(dim=0), grads + elif reduction == "mean": + return output_tensor.sum() / (n_non_ignore * V), grads / (n_non_ignore * V) + else: + return output_tensor, grads + + + def tvd_backward_triton(grad_output, grads): + # If this is the last layer, grad_output is 1.0; skip the multiplication then. 
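+ # (A scalar tensor 1.0 is exactly what autograd passes as grad_output when + # backward() is called directly on the scalar loss.)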
+ if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)): + return grads + + return grads * grad_output + + +class LigerTVDLossFunction(torch.autograd.Function): + """ + Class implementing the forward and backward pass for the Total Variation Distance Loss using Triton. + """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + p: torch.Tensor, + q: torch.Tensor, + shift_labels: Optional[torch.Tensor] = None, + reduction: REDUCTION_LITERAL = "batchmean", + ignore_index: int = -100, + ) -> torch.Tensor: + """A forward pass for the Total Variation Distance Loss. + + Args: + ctx: Torch autograd context + p (torch.Tensor): A tensor of shape (BT, V) containing the first distribution. + q (torch.Tensor): A tensor of shape (BT, V) containing the second distribution. + shift_labels (Optional[torch.Tensor]): A tensor of shape (BT,) containing the labels. + reduction (REDUCTION_LITERAL, optional): The reduction method to be applied. Defaults to "batchmean". + ignore_index (int, optional): The index to ignore during loss calculation. Defaults to -100. + + Returns: + torch.Tensor: The computed Total Variation Distance Loss. + """ + has_label = False + if shift_labels is not None: + assert shift_labels.shape == (p.shape[0],), ( + f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, grads = tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label) + ctx.save_for_backward(grads) + return loss + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + """A backward pass for the Total Variation Distance Loss. + + Args: + ctx: Torch autograd context + grad_output (torch.Tensor): The gradient of the loss with respect to the output. + + Returns: + tuple[torch.Tensor, None, None, None, None]: The gradient of the loss with respect to the inputs. + """ + (grads,) = ctx.saved_tensors + grads = tvd_backward_triton(grad_output, grads) + + return grads, None, None, None, None diff --git a/src/liger_kernel/ops/backends/_ascend/ub_manager.py b/src/liger_kernel/ops/backends/_ascend/ub_manager.py new file mode 100755 index 0000000000000000000000000000000000000000..0873ab619538c4bb33b924219f22fed7d10f1ec1 --- /dev/null +++ b/src/liger_kernel/ops/backends/_ascend/ub_manager.py @@ -0,0 +1,373 @@ +""" +Unified Buffer (UB) Manager for Ascend NPU. + +This module provides UB capacity detection and tiling strategy computation +for running Triton kernels on Ascend NPU. It automatically calculates +optimal block sizes based on UB capacity constraints to prevent UB overflow. +""" + +import os + +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +import triton + +from liger_kernel.utils import is_npu_available + + +def _normalize_tiling_dims(tiling_dim: Union[int, Tuple[int, ...]]) -> set: + """ + Normalize tiling dimension specification to a set of dimension indices. + + Args: + tiling_dim: Either an int (single dimension) or tuple of ints (multiple dimensions). + + Returns: + Set of dimension indices that can be tiled. 
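+ + Examples: 0 -> {0}; (0, 1) -> {0, 1}; any other type -> set().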
+ """ + if isinstance(tiling_dim, int): + return {tiling_dim} + elif isinstance(tiling_dim, tuple): + return set(tiling_dim) + else: + return set() + + +def _default_strategy( + ub_capacity_bits: int, + safety_margin: float, + dtype_size: int, + memory_multiplier: float, + shapes: Tuple[Tuple[int, ...], ...], + tiling_dims: Tuple[Union[int, Tuple[int, ...]], ...], +) -> Tuple[int, ...]: + """ + Default tiling strategy: calculate maximum safe block size based on UB capacity. + + This is a unified strategy function that works for all kernels by abstracting + the memory calculation as: memory_multiplier * BLOCK_SIZE * unit_param * dtype_size * 8 bits + + Args: + ub_capacity_bits: UB capacity in bits + safety_margin: Safety margin as a float (e.g., 0.80 for 80%) + dtype_size: Size of data type in bytes (e.g., 2 for float16, 4 for float32) + memory_multiplier: Memory multiplier for estimating peak memory usage + shapes: Tuple of full shapes. Each shape is a tuple of dimension sizes. + - For ROPE: ((n_q_head, hd), (n_kv_head, hd)) + - For GEGLU: ((n_cols,),) + tiling_dims: Tuple specifying which dimensions can be tiled for each shape. + Each element can be: + - int: single dimension index (e.g., 0 for first dimension) + - tuple of ints: multiple dimensions that can be tiled together + - For ROPE: (0, 0) means first dimension of each shape can be tiled + - For GEGLU: (0,) means first dimension of the shape can be tiled + Length must match len(shapes). + + Returns: + Tuple of maximum safe block sizes, one for each shape. + Each element is a power of 2. + + Note: + For each shape, fixed dimensions (non-tiling) are multiplied together to get unit_param. + The final block size is computed in compute_default_tiling_strategy by taking + min(desired_block_size, max_safe_block_size) where desired_block_size = triton.next_power_of_2(original_dim). + """ + if not shapes or not tiling_dims: + return () + + # Calculate max_safe_block_size for each tiling dimension + max_safe_sizes = [] + + for shape, tiling_dim in zip(shapes, tiling_dims): + # Normalize tiling_dim to a set of dimension indices + tiling_dim_set = _normalize_tiling_dims(tiling_dim) + + # Validate tiling dimensions are within shape bounds + if not tiling_dim_set: + raise ValueError( + f"Invalid tiling_dim: {tiling_dim}. tiling_dim must be an int or a non-empty tuple of ints." + ) + if any(dim_idx < 0 or dim_idx >= len(shape) for dim_idx in tiling_dim_set): + raise ValueError( + f"Invalid tiling_dim: {tiling_dim} for shape {shape}. " + f"All dimension indices must be in range [0, {len(shape)})." 
+ ) + + # Calculate unit_param: product of fixed (non-tiling) dimensions + unit_param = 1.0 + for dim_idx, dim_size in enumerate(shape): + if dim_idx not in tiling_dim_set: + if dim_size <= 0: + # Invalid dimension size, use conservative default + unit_param = 1.0 + break + unit_param *= float(dim_size) + + # Ensure unit_param is at least 1.0 + if unit_param <= 0: + unit_param = 1.0 + + # Calculate maximum safe block size based on UB capacity + # Memory: memory_multiplier * BLOCK_SIZE * unit_param * dtype_size * 8 bits + SAFE_UB_CAPACITY_BITS = int(ub_capacity_bits * safety_margin) + + # Solve: memory_multiplier * BLOCK_SIZE * unit_param * dtype_size * 8 <= SAFE_UB_CAPACITY_BITS + # BLOCK_SIZE <= SAFE_UB_CAPACITY_BITS / (memory_multiplier * unit_param * dtype_size * 8) + max_block_size = int(SAFE_UB_CAPACITY_BITS // (memory_multiplier * unit_param * dtype_size * 8)) + max_block_size = max(1, max_block_size) + + # Find largest power of 2 <= max_block_size + # Use triton.next_power_of_2(max_block_size + 1) // 2 to get the largest power of 2 <= max_block_size + safe_block_size = triton.next_power_of_2(max_block_size + 1) // 2 + max_safe_sizes.append(safe_block_size) + + return tuple(max_safe_sizes) + + +class UBManager: + """ + Unified Buffer Manager for Ascend NPU. + + Provides UB capacity detection and management for Ascend NPU devices. + The UB capacity is used by tiling strategy functions to calculate optimal block sizes. + """ + + def __init__(self, ub_capacity_bits: Optional[int] = None): + """ + Initialize UB Manager. + + Args: + ub_capacity_bits: UB capacity in bits. If None, will be detected automatically. + """ + self._npu_model = self._detect_npu_model() + self._ub_capacity_bits = ub_capacity_bits or self._detect_ub_capacity() + + @property + def ub_capacity_bits(self) -> int: + """Get UB capacity in bits.""" + return self._ub_capacity_bits + + @property + def ub_capacity_bytes(self) -> int: + """Get UB capacity in bytes.""" + return self._ub_capacity_bits // 8 + + @property + def npu_model(self) -> str: + """Get detected NPU model name.""" + return self._npu_model + + def _detect_npu_model(self) -> str: + """Detect NPU model from device properties.""" + if not is_npu_available(): + return "unknown" + + try: + dev_props = torch.npu.get_device_properties(0) + # Try to get model name from device properties + return dev_props.name + except Exception: + pass + + return "default" + + def _detect_ub_capacity(self) -> int: + """ + Detect UB capacity from environment variable or get_soc_spec. + + Returns: + UB capacity in bits. + + Raises: + RuntimeError: If UB capacity cannot be detected and no environment variable is set. 
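+ + For example, the 192 KB UB cited for Atlas 800I A2 in the swiglu helper + corresponds to 192 * 1024 * 8 = 1,572,864 bits.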
+ """ + # Check environment variable first (in bits) + env_capacity = os.getenv("ASCEND_UB_CAPACITY_BITS") + if env_capacity is not None: + try: + capacity_bits = int(env_capacity) + if capacity_bits > 0: + return capacity_bits + except ValueError: + pass + + # Try to get from get_soc_spec (returns bytes, convert to bits) + if is_npu_available(): + try: + from tbe.common.platform import get_soc_spec + from tbe.common.platform import set_current_compile_soc_info + + # Set current SOC info for get_soc_spec to work correctly + device = getattr(torch, "npu") + soc_info = device.get_device_name(device.current_device()) + set_current_compile_soc_info(soc_info) + + # Query UB size (get_soc_spec returns size in bytes) + ub_size_bytes = get_soc_spec("UB_SIZE") + + if ub_size_bytes is None or ub_size_bytes <= 0: + raise ValueError(f"Invalid UB_SIZE from get_soc_spec: {ub_size_bytes}") + + # Convert bytes to bits + ub_capacity_bits = ub_size_bytes * 8 + return ub_capacity_bits + + except ImportError: + raise RuntimeError( + "Cannot import tbe.common.platform.get_soc_spec. " + "Please ensure CANN environment variables are sourced " + "(e.g., source /usr/local/Ascend/ascend-toolkit/set_env.sh)" + ) + except Exception as e: + raise RuntimeError( + f"Failed to detect UB capacity from get_soc_spec: {e}. " + "Please set ASCEND_UB_CAPACITY_BITS environment variable as fallback." + ) + + # If NPU is not available, raise error + raise RuntimeError( + "NPU is not available and UB capacity cannot be detected. " + "Please set ASCEND_UB_CAPACITY_BITS environment variable." + ) + + +# Global singleton instance +_ub_manager: Optional[UBManager] = None + + +def get_ub_manager() -> UBManager: + """Get global UB manager instance.""" + global _ub_manager + if _ub_manager is None: + _ub_manager = UBManager() + return _ub_manager + + +def compute_default_tiling_strategy( + safety_margin: float = 0.80, + dtype_size: Optional[int] = None, + memory_multiplier: Optional[float] = None, + shapes: Optional[Tuple[Tuple[int, ...], ...]] = None, + tiling_dims: Optional[Tuple[Union[int, Tuple[int, ...]], ...]] = None, +) -> Optional[Tuple[Tuple[int, ...], ...]]: + """ + Compute tiling strategy using the default strategy function. + + This function directly calls the default strategy and computes the final + tiling result. All kernels use the same unified strategy function, so + there's no need for kernel_name-based lookup. + + Args: + safety_margin: Safety margin as a float (e.g., 0.80 for 80%). Default is 0.80. + dtype_size: Size of data type in bytes (e.g., 2 for float16, 4 for float32). + Must be provided. If None or <= 0, defaults to 4 (float32). + memory_multiplier: Memory multiplier for estimating peak memory usage. + - For GEGLU: typically 10.0 for backward, 4.0 for forward + - For ROPE: typically 3.0 + If None, defaults to 10.0 (conservative estimate). + shapes: Tuple of full shapes. Each shape is a tuple of dimension sizes. + - For ROPE: ((n_q_head, hd), (n_kv_head, hd)) + - For GEGLU: ((n_cols,),) + Can pass original shapes (will handle padding internally) or padded shapes. + tiling_dims: Tuple specifying which dimensions can be tiled for each shape. + Each element can be: + - int: single dimension index (e.g., 0 for first dimension) + - tuple of ints: multiple dimensions that can be tiled together + - For ROPE: (0, 0) means first dimension of each shape can be tiled + - For GEGLU: (0,) means first dimension of the shape can be tiled + Length must match len(shapes). Cannot be empty. 
+ + Returns: + Tuple of tiled shapes with same structure as input shapes. + Tiling dimensions are replaced with computed block sizes (power of 2), + while non-tiling dimensions are padded to next power of 2. + - For ROPE: ((block_size_q, pad_hd), (block_size_kv, pad_hd)) + - For GEGLU: ((block_size,),) + Returns None if shapes or tiling_dims is None or empty. + + Examples: + >>> # ROPE forward + >>> strategy = compute_default_tiling_strategy( + ... safety_margin=0.90, + ... dtype_size=4, + ... memory_multiplier=3.0, + ... shapes=((32, 128), (32, 128)), + ... tiling_dims=(0, 0) + ... ) + >>> # Returns: ((block_size_q, 128), (block_size_kv, 128)) + >>> # GEGLU forward + >>> strategy = compute_default_tiling_strategy( + ... safety_margin=0.80, + ... dtype_size=2, + ... memory_multiplier=7.0, + ... shapes=((4096,),), + ... tiling_dims=(0,) + ... ) + >>> # Returns: ((block_size,),) + """ + ub_manager = get_ub_manager() + + if shapes is None or not shapes or tiling_dims is None or not tiling_dims: + return None + + if len(shapes) != len(tiling_dims): + return None + + if dtype_size is None or dtype_size <= 0: + dtype_size = 4 # Default to float32 + + if memory_multiplier is None or memory_multiplier <= 0: + memory_multiplier = 10.0 # Default conservative estimate + + # Call strategy to get max_safe_block_size for each shape + max_supported = _default_strategy( + ub_manager.ub_capacity_bits, + safety_margin, + dtype_size, + memory_multiplier, + shapes, + tiling_dims, + ) + + if not max_supported or len(max_supported) != len(shapes): + return None + + # Build result: same structure as shapes, with tiling dims replaced by computed block sizes + result = [] + for shape, tiling_dim, max_safe in zip(shapes, tiling_dims, max_supported): + result_shape = list(shape) + + # Normalize tiling_dim to a set of dimension indices + tiling_dim_set = _normalize_tiling_dims(tiling_dim) + + # Validate tiling dimensions are within shape bounds + if not tiling_dim_set: + raise ValueError( + f"Invalid tiling_dim: {tiling_dim}. tiling_dim must be an int or a non-empty tuple of ints." + ) + if any(dim_idx < 0 or dim_idx >= len(result_shape) for dim_idx in tiling_dim_set): + raise ValueError( + f"Invalid tiling_dim: {tiling_dim} for shape {shape}. " + f"All dimension indices must be in range [0, {len(result_shape)})." + ) + + # Replace tiling dimensions with computed block sizes + # For each tiling dimension, compute: min(desired, max_safe) + for dim_idx in tiling_dim_set: + original_dim = result_shape[dim_idx] + desired = triton.next_power_of_2(original_dim) + final_val = min(desired, max_safe) + final_val = max(1, final_val) # Ensure at least 1 + result_shape[dim_idx] = final_val + + # Pad non-tiling dimensions to next power of 2 + for dim_idx, dim_size in enumerate(result_shape): + if dim_idx not in tiling_dim_set: + result_shape[dim_idx] = triton.next_power_of_2(dim_size) + + result.append(tuple(result_shape)) + + return tuple(result) diff --git a/src/liger_kernel/ops/backends/registry.py b/src/liger_kernel/ops/backends/registry.py new file mode 100755 index 0000000000000000000000000000000000000000..5fe3613c82304d33e20d68b536823edc2c9d152e --- /dev/null +++ b/src/liger_kernel/ops/backends/registry.py @@ -0,0 +1,61 @@ +""" +Vendor registry for Liger-Kernel multi-backend support. + +This module defines VendorInfo and the registry for vendor registration. +Each vendor registers itself by calling register_vendor() in its __init__.py. 
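+
+A minimal registration sketch (illustrative; "ascend"/"npu" are just one possible vendor/device pair):
+
+    from liger_kernel.ops.backends.registry import VendorInfo, register_vendor
+
+    register_vendor(VendorInfo(vendor="ascend", device="npu"))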
+""" + +from dataclasses import dataclass +from typing import Optional + +# Dynamically get backends package path to avoid hardcoding +_BACKENDS_PACKAGE = __name__.rsplit(".", 1)[0] # "liger_kernel.ops.backends" + + +@dataclass +class VendorInfo: + """ + Information about a chip vendor and its supported device. + + Attributes: + vendor: Vendor name (e.g., "ascend", "intel", "nvidia") + device: Device type this vendor supports (e.g., "npu", "xpu") + """ + + vendor: str + device: str + + @property + def module_path(self) -> str: + """Auto-generated module path based on vendor name.""" + return f"{_BACKENDS_PACKAGE}._{self.vendor}.ops" + + +# Registry mapping device types to their vendor info +# Vendors register themselves via register_vendor() +VENDOR_REGISTRY: dict[str, VendorInfo] = {} + + +def register_vendor(vendor_info: VendorInfo) -> None: + """ + Register a vendor's info in the global registry. + + This should be called in each vendor's __init__.py to register itself. + + Args: + vendor_info: VendorInfo instance to register + """ + VENDOR_REGISTRY[vendor_info.device] = vendor_info + + +def get_vendor_for_device(device: str) -> Optional[VendorInfo]: + """ + Get the VendorInfo for a given device type. + + Args: + device: Device type (e.g., "npu", "xpu") + + Returns: + VendorInfo if found, None otherwise + """ + return VENDOR_REGISTRY.get(device) diff --git a/src/liger_kernel/ops/cross_entropy.py b/src/liger_kernel/ops/cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..4793f75c9d14e25e1503908f8deb3879c812f010 --- /dev/null +++ b/src/liger_kernel/ops/cross_entropy.py @@ -0,0 +1,558 @@ +import operator + +from typing import Optional + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import element_mul_kernel +from liger_kernel.ops.utils import is_hip +from liger_kernel.utils import infer_device +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import tanh + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import tanh +else: + from triton.language.math import tanh + + +@triton.jit +def liger_cross_entropy_kernel( + X_ptr, + X_stride, + Y_ptr, + Y_stride, + weight_ptr, + loss_ptr, + z_loss_ptr, + loss_stride, + token_accuracy_ptr, + token_accuracy_stride, + predicted_tokens_ptr, + predicted_tokens_stride, + n_cols, + n_non_ignore, + sum_non_ignore_weight, + weight_sum, + ignore_index, + lse_square_scale: tl.constexpr, + label_smoothing: tl.constexpr, + reduction: tl.constexpr, # set it as constexpr since reduction is always known at compile time + softcap, + RETURN_Z_LOSS: tl.constexpr, + RETURN_TOKEN_ACCURACY: tl.constexpr, + RETURN_PREDICTED_TOKENS: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + HAS_WEIGHT: tl.constexpr, + HAS_SOFTCAPPING: tl.constexpr, + HAS_GRADIENTS: tl.constexpr, +): + """ + This kernel computes both cross entropy loss and the gradient of the input. + We only consider hard label + mean reduction for now. Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math. + + Parameters: + X_ptr: Pointer to input tensor. + X_stride (int): The stride of the input tensor. + Y_ptr: Pointer to target tensor. + Y_stride (int): The stride of the target tensor. 
+ weight_ptr: Pointer to weight tensor. + loss_ptr: Pointer to tensor to store the loss. + z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0. + loss_stride (int): The stride of the loss tensor. + token_accuracy_ptr: Pointer to tensor to store the per-token accuracy. No operation if RETURN_TOKEN_ACCURACY is 0. + token_accuracy_stride (int): The stride of the token accuracy tensor. + n_cols (int): The number of columns in the input tensor. + n_non_ignore (float): The number of non-ignored elements in the batch. + sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch. + weight_sum (float): The sum of weight tensor. + ignore_index (int): The index to ignore in the target. + label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. + lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training. + reduction (str): The string for the reduction to apply + softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap). + RETURN_Z_LOSS (int): The boolean value to decide whether to store z loss to z_loss_ptr or not. It must be 0 or 1. + RETURN_TOKEN_ACCURACY (int): The boolean value to decide whether to store per-token accuracy to token_accuracy_ptr or not. It must be 0 or 1. + BLOCK_SIZE (int): The block size for Triton operations. + HAS_WEIGHT (bool): The boolean value to determine whether assigning weight to each of the classes. + HAS_SOFTCAPPING (bool): The boolean value to determine whether applying soft-capping or not. + HAS_GRADIENTS (bool): The boolean value to determine whether calculating gradients in forward pass. + """ + + # https://github.com/triton-lang/triton/issues/1058 + # If B*T*V is too large, program_id * stride will overflow out of int32, so we convert to int64 + program_id = tl.program_id(0).to(tl.int64) + + # 1. Load Y_ptr first because if the target is ignore_index, we can return right away + Y_ptr += program_id * Y_stride + y = tl.load(Y_ptr) + + # 2. locate the start index + X_ptr += program_id * X_stride + + if y == ignore_index: + # set all X_ptr as 0 + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + tl.store(X_ptr + X_offsets, 0.0, mask=X_offsets < n_cols) + # For ignored tokens, set token accuracy to 0 + if RETURN_TOKEN_ACCURACY: + token_accuracy_ptr += program_id * token_accuracy_stride + tl.store(token_accuracy_ptr, 0.0) + if RETURN_PREDICTED_TOKENS: + predicted_tokens_ptr += program_id * predicted_tokens_stride + tl.store(predicted_tokens_ptr, -1) + return + + loss_ptr += program_id * loss_stride + if RETURN_Z_LOSS: + z_loss_ptr += program_id * loss_stride + if RETURN_TOKEN_ACCURACY: + token_accuracy_ptr += program_id * token_accuracy_stride + if RETURN_PREDICTED_TOKENS: + predicted_tokens_ptr += program_id * predicted_tokens_stride + + if HAS_WEIGHT: + weight_y = tl.load(weight_ptr + y).cast(tl.float32) + + # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax) + # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867 + + # 3. [Online softmax] first pass: find max + sum + m = float("-inf") # m is the max value. use the notation from the paper + d = 0.0 # d is the sum. 
use the notation from the paper + argmax_idx = 0 # Track the index of the maximum value for token accuracy / predicted tokens computation + ori_X_y = tl.load(X_ptr + y).cast(tl.float32) # we need to store the original value of X_y for the loss calculation + if HAS_SOFTCAPPING: + ori_X_y = softcap * tanh(ori_X_y / softcap) + + # Label smoothing is a general case of normal cross entropy + # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310 + scaled_x_sum = 0.0 + eps = label_smoothing / n_cols + + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_offsets, + mask=X_offsets < n_cols, + other=float("-inf"), + # Ensure float32 precision for softmax calculation + ).cast(tl.float32) + if HAS_SOFTCAPPING: + X_block = softcap * tanh(X_block / softcap) + block_max = tl.max(X_block) + + # Track argmax for accuracy / predicted tokens computation + if RETURN_TOKEN_ACCURACY or RETURN_PREDICTED_TOKENS: + # Find the index of the maximum value in this block + is_max_mask = X_block == block_max + # Mask out invalid indices with a value larger than n_cols + masked_offsets = tl.where(is_max_mask, X_offsets, n_cols) + # Get the first (smallest) index where max occurs + current_block_argmax_idx = tl.min(masked_offsets) + + is_new_max = block_max > m + argmax_idx = tl.where(is_new_max, current_block_argmax_idx, argmax_idx) + + if label_smoothing > 0: + # scale X beforehand to avoid overflow + if HAS_WEIGHT: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block * weight_block, 0.0)) + else: + scaled_x_sum += tl.sum(tl.where(X_offsets < n_cols, -eps * X_block, 0.0)) + m_new = tl.maximum(m, block_max) + d = d * tl.exp(m - m_new) + tl.sum(tl.exp(X_block - m_new)) + m = m_new + + # log (sum(e^(X_i))) = log (sum(e ^ (max(X) * e ^ (X_i - max(X))))) + # = log (e^(max(X)) * sum(e ^ (X_i - max(X)))) + # = max(X) + log (sum(e ^ (X_i - max(X)))) = m + log d + lse = m + tl.log(d) + + # 4. 
[Online Softmax] Second pass: compute gradients + # For 'mean' reduction, gradients are normalized by number of non-ignored elements (N) + # dx_y = (softmax(x_y) - 1) / N + # dx_i = softmax(x_i) / N, i != y + # For label smoothing: + # dx_i = (softmax(x_i) - label_smoothing / V) / N, V = n_cols, i != y + # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N + # = dx_i - (1 - label_smoothing) / N + # With Z loss: + # dx_i = ((1 + 2 * lse_square_scale * lse) * softmax(x_i) - label_smoothing / V) / N, i != y + # dx_y = dx_i - (1 - label_smoothing) / N + # For 'sum' reduction, no normalization is applied: + # dx_y = softmax(x_y) - 1 + # dx_i = softmax(x_i), for i ≠ y + if HAS_GRADIENTS: + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load( + X_ptr + X_offsets, + mask=X_offsets < n_cols, + other=float("-inf"), + # Ensure float32 precision for softmax calculation + ).cast(tl.float32) + if HAS_SOFTCAPPING: + intermediate = tanh(X_block / softcap) + X_block = softcap * intermediate + + if not HAS_WEIGHT: + # softmax(x_i) + X_block = tl.exp(X_block - m) / d + # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i) + X_block += 2 * lse_square_scale * lse * X_block + # smoothing term + X_block += -eps + # special handle dx_y + X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing)) + # reduction scale + if reduction == "mean": + X_block = X_block / n_non_ignore + else: + weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols) + softmax_X = tl.exp(X_block - m) / d + # derivative of original_loss + dloss_ori = (1 - label_smoothing) * softmax_X + # specially handle dx_y + dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing)) + dloss_ori = dloss_ori * weight_y + # derivative of smooth_loss + dloss_smooth = eps * (-weight_block + softmax_X * weight_sum) + # derivative of z-loss + dz_loss = 2 * lse_square_scale * lse * softmax_X + # reduction scale + if reduction == "mean": + dloss_ori = dloss_ori / sum_non_ignore_weight + dloss_smooth = dloss_smooth / sum_non_ignore_weight + # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight. + dz_loss = dz_loss / n_non_ignore + # derivative of total_loss + X_block = dloss_ori + dloss_smooth + dz_loss + + # chain rule softcapping + # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap)) + if HAS_SOFTCAPPING: + X_block = X_block * (1 - intermediate * intermediate) + + tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols) + + # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in + # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34 + tl.debug_barrier() + + # 5. 
Calculate the loss
+
+    # loss = log (softmax(X_y)) = log ((e ^ (X_y - max(X)) / sum(e ^ (X - max(X))))
+    # = (X_y - max(X)) - log(sum(e ^ (X - max(X))))
+    # = X_y - m - log d = X_y - lse
+    # sum(e ^ (X - max(X))) must be >= 1 because the max term is e ^ 0 = 1
+    # So we can safely calculate log (softmax(X_y)) without overflow
+    loss = lse - ori_X_y
+    if HAS_WEIGHT:
+        loss = weight_y * loss
+
+    # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
+    # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
+    # = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
+    # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
+    # = (1 - label_smoothing) * H(q, p) + (sum(-eps * x_i) + label_smoothing * (m + logd))
+    # Refer to H(q', p) in section 7 of the paper: https://arxiv.org/pdf/1512.00567
+    # pytorch: https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
+    # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
+    if label_smoothing > 0:
+        if HAS_WEIGHT:
+            smooth_loss = scaled_x_sum + eps * lse * weight_sum
+        else:
+            smooth_loss = scaled_x_sum + label_smoothing * lse
+        loss = loss * (1 - label_smoothing) + smooth_loss
+
+    # An auxiliary loss, z_loss
+    # Refer to Page14 Loss function section in the paper PaLM: https://www.jmlr.org/papers/v24/22-1144.html
+    z_loss = lse_square_scale * lse * lse
+    # Normalize the loss by the number of non-ignored elements if reduction is "mean"
+    if reduction == "mean":
+        if HAS_WEIGHT:
+            loss = loss / sum_non_ignore_weight
+        else:
+            loss = loss / n_non_ignore
+        # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+        z_loss = z_loss / n_non_ignore
+    loss += z_loss
+
+    tl.store(loss_ptr, loss)
+    if RETURN_Z_LOSS:
+        tl.store(z_loss_ptr, z_loss)
+    if RETURN_TOKEN_ACCURACY:
+        # Store 1.0 if prediction is correct, 0.0 otherwise
+        is_correct = 1.0 if argmax_idx == y else 0.0
+        tl.store(token_accuracy_ptr, is_correct)
+    if RETURN_PREDICTED_TOKENS:
+        tl.store(predicted_tokens_ptr, argmax_idx)
+
+
+# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
+# However, setting the limit to 65536 as in the LayerNorm tutorial is faster because of less register spilling
+# The optimal maximum block size depends on your hardware, your kernel, and your dtype;
+# these are the best sizes we found by manually tuning on XPU and NPU.
+if infer_device() == "xpu":
+    MAX_FUSED_SIZE = 4096
+elif infer_device() == "npu":
+    MAX_FUSED_SIZE = 2048
+else:
+    MAX_FUSED_SIZE = 65536 // 2
+
+
+def cross_entropy_forward(
+    _input,
+    target,
+    weight,
+    ignore_index,
+    lse_square_scale,
+    label_smoothing,
+    reduction,
+    softcap,
+    return_z_loss,
+    return_token_accuracy=False,
+    return_predicted_tokens=False,
+):
+    assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
+    assert isinstance(return_token_accuracy, bool), (
+        f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
+    )
+    assert isinstance(return_predicted_tokens, bool), (
+        f"return_predicted_tokens must be True or False. 
Got: {return_predicted_tokens}" + ) + + BT, V = _input.shape + n_rows = BT + + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + + # unreduced loss + loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) + z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) if return_z_loss else None + token_accuracy_1d = ( + torch.zeros(n_rows, dtype=torch.float32, device=_input.device) if return_token_accuracy else None + ) + predicted_tokens_1d = ( + torch.full((n_rows,), -1, dtype=torch.int64, device=_input.device) if return_predicted_tokens else None + ) + + target_mask = target != ignore_index + n_non_ignore = target_mask.sum().item() + assert (target * target_mask).max() < _input.shape[-1], ( + f"Target {target.max()} is out of bounds. Expected < {_input.shape[-1]}" + ) + assert (target * target_mask).min() >= 0, f"Target {target.min()} is out of bounds. Expected >= 0" + sum_non_ignore_weight = n_non_ignore + weight_sum = 0.0 + if weight is not None: + assert weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {weight.shape}" + assert torch.is_floating_point(weight), ( + f"If given, weight has to be a Tensor of floating point dtype. Got: {weight.dtype}" + ) + sum_non_ignore_weight = torch.gather(weight, dim=0, index=target.masked_select(target_mask)).sum().item() + weight_sum = weight.sum().item() + # ensure weight is contiguous + if weight.stride(-1) != 1: + weight = weight.contiguous() + + # ensure _input and target are contiguous in the last dimension + if _input.stride(-1) != 1: + _input = _input.contiguous() + if target.stride(-1) != 1: + target = target.contiguous() + + # Here we use a trick to store X_ptr gradient in X_ptr so we can save memory + liger_cross_entropy_kernel[(n_rows,)]( + X_ptr=_input, + X_stride=_input.stride(-2), + Y_ptr=target, + Y_stride=target.stride(-1), # always 1 + weight_ptr=weight, # dummy if None + loss_ptr=loss_1d, + z_loss_ptr=z_loss_1d, + loss_stride=loss_1d.stride(-1), # always 1 + token_accuracy_ptr=token_accuracy_1d, + token_accuracy_stride=token_accuracy_1d.stride(-1) + if return_token_accuracy + else 0, # always 1 if accuracy is enabled + predicted_tokens_ptr=predicted_tokens_1d, + predicted_tokens_stride=predicted_tokens_1d.stride(-1) + if return_predicted_tokens + else 0, # always 1 if predicted tokens is enabled + n_cols=V, + n_non_ignore=n_non_ignore, + sum_non_ignore_weight=sum_non_ignore_weight, + ignore_index=ignore_index, + weight_sum=weight_sum, + lse_square_scale=lse_square_scale, + label_smoothing=label_smoothing, + reduction=reduction, + softcap=softcap, + RETURN_Z_LOSS=return_z_loss, + RETURN_TOKEN_ACCURACY=return_token_accuracy, + RETURN_PREDICTED_TOKENS=return_predicted_tokens, + BLOCK_SIZE=BLOCK_SIZE, + HAS_WEIGHT=True if weight is not None else False, + HAS_SOFTCAPPING=True if softcap is not None else False, + HAS_GRADIENTS=_input.requires_grad, + # TODO: 32 seems to give the best performance + # Performance is quite sensitive to num_warps + num_warps=32 if not is_hip() else 16, + ) + + if reduction == "none": + loss = loss_1d + z_loss = z_loss_1d if return_z_loss else None + token_accuracy = token_accuracy_1d if return_token_accuracy else None + else: + loss = torch.sum(loss_1d) + z_loss = torch.sum(z_loss_1d) if return_z_loss else None + # For accuracy, we compute the mean across all non-ignored tokens + token_accuracy = torch.sum(token_accuracy_1d) / n_non_ignore if return_token_accuracy else None + + predicted_tokens = predicted_tokens_1d if 
return_predicted_tokens else None
+
+    return loss, z_loss, token_accuracy, predicted_tokens, _input
+
+
+def cross_entropy_backward(_input, grad_output):
+    # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
+    if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+        pass
+    # If reduction is 'none'
+    elif grad_output.ndim > 0:
+        _input = _input * grad_output.unsqueeze(dim=1)
+    # If reduction is ['mean', 'sum'], grad_output is just a scalar
+    # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
+    # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
+    else:
+        BT, V = _input.shape
+        n_rows = BT
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+        element_mul_kernel[(n_rows,)](
+            _input,
+            _input.stride(-2),
+            grad_output,
+            V,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=32 if not is_hip() else 16,
+        )
+
+    return _input
+
+
+class LigerCrossEntropyFunction(torch.autograd.Function):
+    """
+    This class implements a custom autograd function for the Liger Cross Entropy loss.
+    It overrides the forward and backward methods of the torch.autograd.Function class.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        _input: torch.Tensor,
+        target: torch.Tensor,
+        weight: Optional[torch.FloatTensor],
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
+    ):
+        """
+        The forward pass of the Liger Cross Entropy loss.
+
+        Parameters:
+        ctx : The context object.
+        _input (tensor): The input tensor of shape (BT, V) where B is batch size, T is sequence length, V is vocab size.
+        target (tensor): The target tensor of shape (BT) where each value is in [0, V-1].
+        weight (Tensor, optional): A manual rescaling weight given to each class. If given, has to be a Tensor of size V and floating point dtype.
+        ignore_index (int): The index to ignore in the target.
+        lse_square_scale (float): The scaler of (logsumexp(_input)) ^ 2 adding to the loss for the stability of training.
+        label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
+        reduction (str): The reduction to apply to the output: "none" | "mean" | "sum".
+        softcap (Optional[float]): The upper threshold for scaling logits to the range (-softcap, +softcap).
+        return_z_loss (bool): When `return_z_loss` is `True`, returns (loss, z_loss, token_accuracy, predicted_tokens) instead of (loss, None, None, None). Default: `False`
+        return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
+        return_predicted_tokens (bool): When `return_predicted_tokens` is `True`, returns per-token predicted class indices (argmax) without materializing logits. Default: `False`
+
+        Returns:
+        tuple: A tuple with the computed losses, accuracy, and predicted tokens: (loss, z_loss, token_accuracy, predicted_tokens). z_loss, token_accuracy, and predicted_tokens are None if not requested. 
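+
+        A minimal usage sketch (shapes, values, and device are illustrative):
+        >>> logits = torch.randn(8, 32000, device="cuda", requires_grad=True)  # (BT, V)
+        >>> target = torch.randint(0, 32000, (8,), device="cuda")
+        >>> loss, z_loss, acc, pred = LigerCrossEntropyFunction.apply(logits, target, None)
+        >>> loss.backward()  # z_loss, acc, and pred are None unless the corresponding flags are set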
+ """ + input_requires_grad = _input.requires_grad + + loss, z_loss, token_accuracy, predicted_tokens, _input = cross_entropy_forward( + _input, + target, + weight, + ignore_index, + lse_square_scale, + label_smoothing, + reduction, + softcap, + return_z_loss, + return_token_accuracy, + return_predicted_tokens, + ) + # TODO: investigation + # If we don't detach the _input tensor, the memory will double + # Not sure why but seems that there will be a time both grad and value exist but in different location + if input_requires_grad: + ctx.save_for_backward(_input.detach()) + ctx.return_z_loss = return_z_loss + ctx.return_token_accuracy = return_token_accuracy + ctx.return_predicted_tokens = return_predicted_tokens + + return loss, z_loss, token_accuracy, predicted_tokens + + @staticmethod + def backward(ctx, grad_output, grad_output2, grad_output3, grad_output4): + """ + The backward pass of the Liger Cross Entropy loss. + + Parameters: + ctx : The context object with saved tensors. + grad_output (tensor): The tensor containing the gradient of the loss with respect to the output. + grad_output2 (tensor): No use. Gradient for z_loss (not used as z_loss is only for logging). + grad_output3 (tensor): No use. Gradient for token_accuracy (not used as token_accuracy is only for metrics). + grad_output4 (tensor): No use. Gradient for predicted_tokens (not used as predicted_tokens is only for metrics). + Returns: + tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None. + """ + if ctx.return_z_loss: + del grad_output2 # z_loss is only for logging + if ctx.return_token_accuracy: + del grad_output3 # token_accuracy is only for metrics + if ctx.return_predicted_tokens: + del grad_output4 # predicted_tokens is only for metrics + + (_input,) = ctx.saved_tensors + _input = cross_entropy_backward(_input, grad_output) + return ( + _input, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/dyt.py b/src/liger_kernel/ops/dyt.py new file mode 100755 index 0000000000000000000000000000000000000000..432c0ee275681150c8915906ab7c0d334405ae7c --- /dev/null +++ b/src/liger_kernel/ops/dyt.py @@ -0,0 +1,160 @@ +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import infer_device +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import tanh + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import tanh +else: + from triton.language.math import tanh + + +# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw) +# for bn in [1024, 2048, 4096] +# for ns in [1,2,4] +# for nw in [4, 8, 16, 32] +# ], +# key=['N']) +@triton.jit +def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024): + col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N) + mask = col < N + row_id = tl.cast(tl.program_id(1), tl.int64) + + X += row_id * N + Y += row_id * N + alpha = tl.load(Alpha).to(tl.float32) + + gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32) + + x = tl.load(X + col, 
mask=mask, other=0.0).to(tl.float32) + + tanh_x = tanh(alpha * x) + y = tanh_x * gamma + if HAVE_BETA: + beta = tl.load(Beta + col, mask=mask, other=0.0).to(tl.float32) + y += beta + tl.store(Y + col, y, mask=mask) + + +# @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw) +# for bn in [1024, 2048, 4096] +# for ns in [1,2,4] +# for nw in [4, 8, 16] +# ], +# key=['N']) +@triton.jit +def _dyt_bwd_kernel( + DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024 +): + col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N) + mask = col < N + start_row_id = tl.cast(tl.program_id(1), tl.int64) + + alpha = tl.load(Alpha).to(tl.float32) + da = 0.0 + gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32) + dg = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAVE_BETA: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + for row_id in range(start_row_id, M, tl.num_programs(1)): + x = tl.load(X + row_id * N + col, mask=mask, other=0.0).to(tl.float32) + dy = tl.load(DY + row_id * N + col, mask=mask, other=0.0).to(tl.float32) + tanh_x = tanh(alpha * x) + if HAVE_BETA: + db += dy + dg += dy * tanh_x + tmp = (1 - tanh_x * tanh_x) * dy * gamma + da += tl.sum(x * tmp, 0) + dx = alpha * tmp + tl.store(DX + row_id * N + col, dx, mask=mask) + + tl.store(DG + start_row_id * N + col, dg, mask=mask) + if HAVE_BETA: + tl.store(DB + start_row_id * N + col, db, mask=mask) + tl.store(DA + start_row_id * tl.cdiv(N, 512) + tl.program_id(0), da) + + +def liger_dyt_fwd(x, alpha, gamma, beta): + assert x.is_contiguous() + HAVE_BETA = True if beta is not None else False + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + M, N = x.shape + + y = torch.empty_like(x) + + if N >= 4096: + kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 2048), "num_warps": 4, "num_stages": 1} + else: + kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 4, "num_stages": 1} + + grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), M) + _dyt_fwd_kernel[(grid)]( + x, + y, + alpha, + gamma, + beta, + HAVE_BETA, + N, + **kwargs, + ) + return y.view(input_shape) + + +def liger_dyt_bwd(dy, x, alpha, gamma, beta): + assert dy.is_contiguous() + input_shape = x.shape + x = x.view(-1, input_shape[-1]) + M, N = x.shape + HAVE_BETA = True if beta is not None else False + + device = infer_device() + if device == "cuda": + NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count + elif device == "xpu": + NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count + elif device == "npu": + NUM_SMS = get_npu_core_count() + da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device) + dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) + db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None + dx = torch.empty_like(dy) + + kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 8, "num_stages": 2} + grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS) + _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, **kwargs) + if HAVE_BETA: + db = db.sum(0).to(x.dtype) + dg = dg.sum(0).to(gamma.dtype) + da = da.sum().to(x.dtype).unsqueeze(0) + return dx.view(input_shape), da, dg, db + + +class LigerDyTFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, x, alpha, gamma, beta): + y = liger_dyt_fwd(x, alpha, gamma, beta) + ctx.save_for_backward(x, alpha, 
gamma, beta) + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, dy): + x, alpha, gamma, beta = ctx.saved_tensors + dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta) + return dx, dalpha, dgamma, dbeta diff --git a/src/liger_kernel/ops/experimental/embedding.py b/src/liger_kernel/ops/experimental/embedding.py new file mode 100755 index 0000000000000000000000000000000000000000..159b9a66d64158332c37e763ca9763ac9ede1932 --- /dev/null +++ b/src/liger_kernel/ops/experimental/embedding.py @@ -0,0 +1,141 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous + + +@triton.jit +def embedding_forward_kernel( + embeddings_ptr, + indices_ptr, + output_ptr, + n_elements, + embedding_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + start_m = pid_m * BLOCK_SIZE_M + start_n = pid_n * BLOCK_SIZE_N + offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M) + mask_m = offsets_m < n_elements + indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0) + offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N) + mask_n = offsets_n < embedding_dim + + embedding_offsets = indices[:, None] * embedding_dim + offsets_n[None, :] + embeddings = tl.load( + embeddings_ptr + embedding_offsets, + mask=mask_m[:, None] & mask_n[None, :], + other=0.0, + ) + + output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :] + tl.store(output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :]) + + +@triton.jit +def embedding_backward_kernel( + grad_output_ptr, + grad_weight_ptr, + indices_ptr, + n_elements, + embedding_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, +): + pid_m = tl.program_id(0) + pid_n = tl.program_id(1) + + start_m = pid_m * BLOCK_SIZE_M + start_n = pid_n * BLOCK_SIZE_N + offsets_m = start_m + tl.arange(0, BLOCK_SIZE_M) + mask_m = offsets_m < n_elements + indices = tl.load(indices_ptr + offsets_m, mask=mask_m, other=0) + offsets_n = start_n + tl.arange(0, BLOCK_SIZE_N) + mask_n = offsets_n < embedding_dim + + grad_output = tl.load( + grad_output_ptr + offsets_m[:, None] * embedding_dim + offsets_n[None, :], + mask=mask_m[:, None] & mask_n[None, :], + other=0.0, + ) + + grad_weight_offsets = indices[:, None] * embedding_dim + offsets_n[None, :] + + tl.atomic_add( + grad_weight_ptr + grad_weight_offsets, + grad_output, + mask=mask_m[:, None] & mask_n[None, :], + ) + + +class LigerEmbeddingFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, embeddings: torch.Tensor, indices: torch.Tensor): + ori_shape = indices.shape + indices = indices.view(-1) + output = torch.empty( + indices.shape[0], + embeddings.shape[1], + device=indices.device, + dtype=embeddings.dtype, + ) + + n_elements = indices.numel() + embedding_dim = embeddings.shape[1] + + BLOCK_SIZE_M = triton.next_power_of_2(min(128, embedding_dim)) + BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim)) + grid = ( + triton.cdiv(n_elements, BLOCK_SIZE_M), + triton.cdiv(embedding_dim, BLOCK_SIZE_N), + ) + + embedding_forward_kernel[grid]( + embeddings, + indices, + output, + n_elements, + embedding_dim=embedding_dim, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + + ctx.save_for_backward(indices, embeddings) + + return output.view(*ori_shape, -1) + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor): + indices, embedding_table 
= ctx.saved_tensors + grad_output = grad_output.contiguous().view(-1, embedding_table.shape[1]) + + grad_weight = torch.zeros_like(embedding_table) + + n_elements = indices.numel() + embedding_dim = embedding_table.shape[1] + + BLOCK_SIZE_M = triton.next_power_of_2(min(128, embedding_dim)) + BLOCK_SIZE_N = triton.next_power_of_2(min(128, embedding_dim)) + grid = ( + triton.cdiv(n_elements, BLOCK_SIZE_M), + triton.cdiv(embedding_dim, BLOCK_SIZE_N), + ) + + embedding_backward_kernel[grid]( + grad_output, + grad_weight, + indices, + n_elements, + embedding_dim=embedding_dim, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + ) + + return grad_weight, None diff --git a/src/liger_kernel/ops/experimental/mm_int8int2.py b/src/liger_kernel/ops/experimental/mm_int8int2.py new file mode 100755 index 0000000000000000000000000000000000000000..326d536326698174b944b4915b4443a670980ca1 --- /dev/null +++ b/src/liger_kernel/ops/experimental/mm_int8int2.py @@ -0,0 +1,349 @@ +import torch +import triton +import triton.language as tl + + +def unpack_weights(packed: torch.Tensor, bits: int = 2) -> torch.Tensor: + values_per_item = 8 // bits + packed_shape = packed.shape + + if len(packed_shape) == 1: + original_row_dim = packed_shape[0] * values_per_item + unpacked_shape = (original_row_dim,) + else: + original_row_dim = packed_shape[0] * values_per_item + unpacked_shape = (original_row_dim, *packed_shape[1:]) + + unpacked = torch.zeros(unpacked_shape, device=packed.device, dtype=torch.uint8) + + for i in range(values_per_item): + start = i * packed_shape[0] + end = start + packed_shape[0] + mask = 3 << (2 * i) + unpacked[start:end] = (packed & mask) >> (2 * i) + + unpacked = unpacked.to(torch.int32) - 1 + return unpacked + + +def pack_weights(intweights: torch.Tensor, bits: int = 2) -> torch.Tensor: + intweights += 1 + original_shape = intweights.shape + values_per_item = 8 // bits + row_dim = (original_shape[0] + values_per_item - 1) // values_per_item + + if len(original_shape) == 1: + packed_tensor_shape = (row_dim,) + else: + packed_tensor_shape = (row_dim, *original_shape[1:]) + + packed = torch.zeros(packed_tensor_shape, device=intweights.device, dtype=torch.uint8) + unpacked = intweights.to(torch.uint8) + + def lshift(t: torch.Tensor, bits: int): + return t << bits + + it = min(values_per_item, (original_shape[0] // row_dim) + 1) + for i in range(it): + start = i * row_dim + end = min(start + row_dim, original_shape[0]) + packed[: (end - start)] |= lshift(unpacked[start:end], bits * i) + + return packed + + +def get_autotune_config(): + return [ + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 
256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=3, + num_warps=8, + ), + triton.Config( + { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 8, + }, + num_stages=4, + num_warps=4, + ), + triton.Config( + { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 32, + "GROUP_SIZE_M": 4, + }, + num_stages=4, + num_warps=4, + ), + ] + + +@triton.autotune( + configs=get_autotune_config(), + key=["M", "N", "K"], +) +@triton.jit +def matmul_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + N, + K: tl.constexpr, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + # We want K / 4 to be divisible by BLOCK_SIZE_K so that the multiplication can be aligned + tl.static_assert( + K % (4 * BLOCK_SIZE_K) == 0, + "K / 4 must be divisible by BLOCK_SIZE_K => K divisible by 4*BLOCK_SIZE_K", + ) + # determine the block id in the 1D grid, pid <=> blockId in cuda + pid = tl.program_id(axis=0) + # number of blocks we would need in the M dimension + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + # number of blocks we would need in the N dimension + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + # blocks are grouped along the M dimension. num_pid_in_group computes how many blocks are grouped together, + # and group_id calculates the group to which the current block (pid) belongs. 
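+    # Illustrative example (values assumed): with GROUP_SIZE_M = 2 and num_pid_n = 3,
+    # num_pid_in_group = 6, so pids 0..5 form group 0 and map (via the computation below) to
+    # (pid_m, pid_n) = (0,0), (1,0), (0,1), (1,1), (0,2), (1,2):
+    # two rows of output blocks are swept column by column, which improves L2 reuse of A's rows.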
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+
+    # pid of the first block in the group that the current block belongs to
+    first_pid_m = group_id * GROUP_SIZE_M
+
+    # pid_m : pid of the block along the M dimension of the output matrix, and pid_n : pid of the block along the N dimension of the output matrix
+    # remember that the grid of blocks is 1D, but we calculate pid_m and pid_n to locate the block's place in the output matrix
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # offs_am represents the indices of elements within the block for matrix A with respect to the M dimension
+    # offs_bn represents the indices of elements within the block for matrix B with respect to the N dimension
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    """
+    This part of the code generates pointers to the specific blocks of matrices A and B that the current thread block will process.
+
+    As described in the PyTorch documentation, a stride refers to the step size needed to move from one element to the next along a given dimension:
+
+    For matrix A: stride_am = A.stride(0) = K (stride along the rows), and stride_ak = A.stride(1) = 1 (stride along the columns).
+    For matrix B: stride_bk = B.stride(0) = N (stride along the rows), and stride_bn = B.stride(1) = 1 (stride along the columns).
+    Now, let's break down the pointer generation:
+
+    offs_am[:, None] creates a column of shape [BLOCK_SIZE_M, 1], which represents the row indices of matrix A that this block is processing. It is multiplied by K (the number of columns in matrix A) since A is stored in row-major order. So, the element at position (i, j) in A is located at index i*K + j in memory.
+    offs_k[None, :] creates a row vector of shape [1, BLOCK_SIZE_K] representing the column indices of the block, i.e., a range from 0 to BLOCK_SIZE_K. This is used to compute the positions of the columns within the block.
+    When combined, the result has the shape [BLOCK_SIZE_M, BLOCK_SIZE_K], where each entry (i, j) points to the element in matrix A at position (i, j) for the current block.
+
+    The same logic is applied to matrix B, but the resulting shape is [BLOCK_SIZE_K, BLOCK_SIZE_N], representing the block of matrix B that the thread block will work on.
+    """
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    # An accumulator matrix is initialized with zeros. It stores the intermediate results of the block matrix multiplication.
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.int32)
+    """
+    We split the loop into two layers. The outer loop runs 4 times, and each iteration focuses on a specific portion of matrix A.
+
+    For example, when i = 0, we're only concerned with the blocks of matrix A that cover the range from 0 to K // (4 * BLOCK_SIZE_K).
+    Since matrix B is packed, its first dimension is effectively divided by 4. So, while we process the first segment of matrix A,
+    we still iterate over the entire first dimension of matrix B.
+
+    In each of the 4 iterations of the outer loop, we go through the full blocks of matrix B, but what changes is the data we extract.
+    Matrix B elements contain 4 weights, all packed into an int8 format, and during each iteration of the outer loop,
+    we extract a different weight by using bitwise shifting operations. This way, we access a unique weight on each pass.
+    """
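+    # Worked example of the bit extraction below (byte value assumed for illustration):
+    # a packed byte 0b11010010 holds four 2-bit codes; for i = 1 the mask is
+    # 3 << 2 = 0b00001100, so (byte & mask) >> 2 = 0b00 = 0, and subtracting the
+    # packing bias of 1 recovers the signed ternary weight -1.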
+    for i in range(4):
+        b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+        for j in range(0, tl.cdiv(K // 4, BLOCK_SIZE_K)):
+            k = i * tl.cdiv(K // 4, BLOCK_SIZE_K) + j
+            # load the block of matrix A
+            a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0)
+            # load the block of matrix B
+            b_uint8 = tl.load(b_ptrs, mask=offs_k[:, None] < K, other=0)
+            # when i = 0, for example, we only care about the first 2 bits of the elements of matrix B, so we use the mask 00000011 to mask out the other bits
+            mask = 3 << (2 * i)
+            # we shift the result after the mask
+            b = (b_uint8 & mask) >> (2 * i)
+            # During the packing of the weights, it's easier to pack 0, 1, 2 than -1, 0, 1, so we add 1 to the weight tensor, and we subtract it here
+            tensor_full = tl.full((1,), 1, dtype=tl.int8)
+            # We accumulate the result of the multiplication of the blocks along the K dimension in int32 to avoid any overflows or underflows.
+            accumulator += tl.dot(a, (b.to(tl.int8) - tensor_full), out_dtype=tl.int32)
+            # we move the pointers: for a_ptrs we move horizontally along the second dimension -> we use stride_ak = 1
+            # for b_ptrs we move vertically, along the rows -> we use stride_bk = N
+            a_ptrs += BLOCK_SIZE_K * stride_ak
+            b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    c = accumulator
+    # These lines compute the offsets into matrix C where the result of this block's computation should be stored.
+    # stride_cm = N & stride_cn = 1
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    # we do a boundary check to ensure only elements within matrix bounds are stored
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+def matmul(a, b):
+    assert a.shape[1] == b.shape[0] * 4, "Incompatible dimensions, the weight matrix needs to be packed"
+    assert a.is_contiguous(), "Matrix A must be contiguous"
+    M, K = a.shape
+    _, N = b.shape
+    # c is in int32 to avoid any overflows or underflows
+    c = torch.empty((M, N), device=a.device, dtype=torch.int32)
+    grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
+    matmul_kernel[grid](
+        a,
+        b,
+        c,
+        M,
+        N,
+        K,
+        a.stride(0),
+        a.stride(1),
+        b.stride(0),
+        b.stride(1),
+        c.stride(0),
+        c.stride(1),
+    )
+    return c
diff --git a/src/liger_kernel/ops/fused_add_rms_norm.py b/src/liger_kernel/ops/fused_add_rms_norm.py
new file mode 100755
index 0000000000000000000000000000000000000000..866377687fd44f5374d4f1eef9e5e15cf8d9cbad
--- /dev/null
+++ b/src/liger_kernel/ops/fused_add_rms_norm.py
@@ -0,0 +1,410 @@
+import math
+import operator
+
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import compare_version
+from liger_kernel.ops.utils import ensure_contiguous
+from liger_kernel.ops.utils import get_npu_core_count
+from liger_kernel.ops.utils import set_large_grf_mode
+from liger_kernel.ops.utils import torch_to_triton_dtype
+from liger_kernel.utils import is_npu_available
+
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
+    try:
+        # 
typical import path with dispatch available + from triton.language.extra.libdevice import rsqrt + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import rsqrt +else: + from triton.language.math import rsqrt + + +_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1) +_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0) +_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1) + + +@triton.jit +def _fused_add_rms_norm_forward_kernel( + Y_ptr, + Y_row_stride, + S_ptr, # output residual + S_row_stride, + X_ptr, + X_row_stride, + R_ptr, # input residual + R_row_stride, + W_ptr, + W_row_stride, + RSTD_ptr, + RSTD_row_stride, + n_cols, + eps, + offset, + casting_mode: tl.constexpr, # constexpr so the `if` blocks can be optimized out + BLOCK_SIZE: tl.constexpr, +): + """ + This kernel computes the following: + 1. hidden_states = residual + hidden_states + 2. residual = hidden_states + 3. hidden_states = rmsnorm(hidden_states) + + This is a commonly used pattern in the decoder layers of LLMs. + Some examples: + 1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271 + 2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393 + + This kernel is inspired by the rms_norm forward kernel, and is adapted to support the residual addition in the forward pass. + The backward pass is also adapted to support the residual addition in the backward pass. + """ + + row_idx = tl.program_id(0).to(tl.int64) + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + Y_ptr += row_idx * Y_row_stride + S_ptr += row_idx * S_row_stride + X_ptr += row_idx * X_row_stride + R_ptr += row_idx * R_row_stride + RSTD_ptr += row_idx * RSTD_row_stride + + X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0) + R_row = tl.load(R_ptr + col_offsets, mask=mask, other=0) + S_row = X_row + R_row + tl.store(S_ptr + col_offsets, S_row, mask=mask) + S_row_dtype = S_row.dtype + W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0) + + # On Llama, only rstd is computed on fp32 + if casting_mode == _CASTING_MODE_LLAMA: + S_row = S_row.to(tl.float32) + + # Gemma computes everything on fp32, and then casts back the output to the original dtype + if casting_mode == _CASTING_MODE_GEMMA: + W_row = W_row.to(tl.float32) + S_row = S_row.to(tl.float32) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(S_row_dtype) + offset = offset.to(S_row_dtype) + + mean_square = tl.sum(S_row * S_row, axis=0) / n_cols + rstd = rsqrt(mean_square + eps) + + # We can save time by caching rms with minimal memory overhead + # because rms is much smaller compared to X_row, as rms is for each row. + # However, on the computation side, it can save 4 operations (*, sum, /, sqrt). 
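+    # Worked example (row values assumed for illustration): for a row [1, 2, 2] with eps = 0,
+    # mean_square = (1 + 4 + 4) / 3 = 3.0 and rstd = 1 / sqrt(3).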
+    tl.store(RSTD_ptr, rstd)
+
+    S_row = S_row * rstd
+
+    # On Llama, the multiplication with the weight is done in the original dtype
+    if casting_mode == _CASTING_MODE_LLAMA:
+        S_row = S_row.to(S_row_dtype)
+
+    Y_row = S_row * (offset + W_row)
+
+    if casting_mode == _CASTING_MODE_GEMMA:
+        Y_row = Y_row.to(S_row_dtype)
+
+    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
+
+
+@triton.jit
+def _fused_add_rms_norm_backward_kernel(
+    dY_ptr,
+    dY_row_stride,
+    dS_out_ptr,
+    dS_out_row_stride,
+    dX_ptr,
+    dX_row_stride,
+    X_ptr,
+    X_row_stride,
+    X_dtype: tl.constexpr,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    dW_ptr,
+    dW_row_stride,
+    n_rows,
+    n_cols,
+    offset,
+    rows_per_program: tl.constexpr,
+    casting_mode: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    has_dS_out: tl.constexpr,
+):
+    """
+    This kernel is adapted from the rms_norm backward kernel to support the residual
+    addition in the backward pass, for the following code pattern:
+    1. hidden_states = residual + hidden_states
+    2. residual = hidden_states
+    3. hidden_states = rmsnorm(hidden_states)
+
+    The gradients of hidden_states and residual come out to be exactly the same. The value of this gradient is
+    the sum of the gradient of the hidden_states in step 3 and the gradient of the residual in step 2.
+
+    The backward pass computation logic is the same as in the rms_norm backward kernel, except that the gradient
+    of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.
+    """
+
+    row_block_id = tl.program_id(0).to(tl.int64)
+    row_start = row_block_id * rows_per_program
+    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+
+    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
+    W_row = W_row + offset
+
+    for row_idx in range(row_start, row_end):
+        dy_base = dY_ptr + row_idx * dY_row_stride
+        dx_base = dX_ptr + row_idx * dX_row_stride
+
+        x_base = X_ptr + row_idx * X_row_stride
+        rstd_base = RSTD_ptr + row_idx * RSTD_row_stride
+
+        dY_row = tl.load(dy_base + col_offsets, mask=mask, other=0.0)
+        X_row = tl.load(x_base + col_offsets, mask=mask, other=0.0)
+
+        # Get cached rstd
+        rstd_row = tl.load(rstd_base)
+
+        X_row = X_row.to(tl.float32)
+
+        # Different backward graphs for different casting modes
+        if casting_mode == _CASTING_MODE_LLAMA:
+            m = (dY_row * W_row).to(tl.float32)
+
+        elif casting_mode == _CASTING_MODE_GEMMA:
+            dY_row = dY_row.to(tl.float32)
+            m = dY_row * W_row
+        else:
+            m = dY_row * W_row
+
+        dX_row = rstd_row * m
+
+        if has_dS_out:
+            ds_base = dS_out_ptr + row_idx * dS_out_row_stride
+            dS_out_row = tl.load(ds_base + col_offsets, mask=mask, other=0.0)
+            dX_row += (rstd_row) * (
+                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
+            ) + dS_out_row
+        else:
+            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)
+
+        # calculate the gradient of W
+        if casting_mode == _CASTING_MODE_LLAMA:
+            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
+        else:
+            # here X_row is already in fp32 (see previous if block)
+            dW_row += dY_row * (X_row * rstd_row)
+
+        tl.store(dx_base + col_offsets, dX_row.to(X_dtype), mask=mask)
+
+    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
+
+
+_str_to_casting_mode = {
+    "llama": _CASTING_MODE_LLAMA.value,
+    "gemma": _CASTING_MODE_GEMMA.value,
+    "none": _CASTING_MODE_NONE.value,
+}
+
+
+def fused_add_rms_norm_forward(X, R, 
+    if not isinstance(casting_mode, int):
+        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
+        casting_mode = _str_to_casting_mode[casting_mode]
+    else:
+        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"
+
+    shape = X.shape
+    dim = shape[-1]
+    X = X.view(-1, dim)
+    R = R.view(-1, dim)
+    n_rows, n_cols = X.shape
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    # RSTD caches the rstd (inverse RMS) of each row
+    # RSTD is always computed/stored in fp32 when using the Llama or Gemma casting mode
+    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
+    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)
+
+    # Check constraints.
+    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between X.shape[1] and W.shape[0]"
+
+    # XPU-specific optimization
+    kernel_args = {}
+    if X.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+
+    # TODO: add _block_fused_add_rms_norm_forward_kernel
+    _fused_add_rms_norm_forward_kernel[(n_rows,)](
+        Y,
+        Y.stride(0),
+        S,
+        S.stride(0),
+        X,
+        X.stride(0),
+        R,
+        R.stride(0),
+        W,
+        W.stride(0),
+        RSTD,
+        RSTD.stride(0),
+        n_cols,
+        eps,
+        offset,
+        casting_mode,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+        **kernel_args,  # XPU-specific optimization
+    )
+
+    return Y.view(*shape), S.view(*shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
+
+
+def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
+    shape = dY.shape
+    dim = shape[-1]
+    dY = dY.view(-1, dim)
+    dS_out = dS_out.view(-1, dim)
+    S = S.view(-1, dim)
+    n_rows, n_cols = dY.shape
+
+    sm_count = 1
+    if S.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
+    elif S.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
+    elif S.device.type == "npu":
+        sm_count = get_npu_core_count()
+
+    # Accumulate dW partials in fp32 for numerical stability.
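+    # Each program reduces rows_per_program rows into its own _dW slice; the
+    # final dW is _dW.sum(dim=0). Illustrative sizing (hypothetical values):
+    # n_rows=8192, n_cols=4096, sm_count=108 gives rows_per_program =
+    # ceil(8192 / 108) = 76 and _dW of shape (108, 4096).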
+    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+
+    if n_cols > BLOCK_SIZE:
+        raise RuntimeError("This rms norm doesn't support feature dim >= 64KB.")
+    rows_per_program = math.ceil(n_rows / sm_count)
+    grid = (sm_count,)
+
+    if in_place is True:
+        dX = dY
+    else:
+        dX = torch.empty_like(dY)
+
+    # XPU-specific optimization
+    kernel_args = {}
+    if S.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+
+    # TODO: add _block_fused_add_rms_norm_backward_kernel
+    _fused_add_rms_norm_backward_kernel[grid](
+        dY,
+        dY.stride(0),
+        dS_out,
+        dS_out.stride(0),
+        dX,
+        dX.stride(0),
+        S,
+        S.stride(0),
+        torch_to_triton_dtype[S.dtype],
+        W,
+        W.stride(0),
+        RSTD,
+        RSTD.stride(0),
+        _dW,
+        _dW.stride(0),
+        n_rows,
+        n_cols,
+        offset,
+        rows_per_program,
+        casting_mode,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+        has_dS_out=dS_out is not None,
+        **kernel_args,  # XPU-specific optimization
+    )
+
+    dX = dX.view(*shape)
+    dW = _dW.sum(dim=0).to(W.dtype)
+
+    return dX, dX, dW  # dR is equal to dX
+
+
+class LigerFusedAddRMSNormFunction(torch.autograd.Function):
+    """
+    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.
+
+    This class implements the following sequence, commonly used in transformer decoder layers:
+    1. hidden_states = residual + hidden_states
+    2. residual = hidden_states (after addition)
+    3. hidden_states = rmsnorm(hidden_states)
+
+    Both the normalized hidden_states and the updated residual are returned as outputs.
+
+    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
+    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
+    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.
+
+    In addition, different models cast their inputs at different places during RMSNorm computation. For
+    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
+    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
+    support the following casting modes (they match HuggingFace Transformers' implementations):
+    - 'llama': matches the Llama implementation, where only the inverse RMS is computed in fp32.
+    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, computed, then cast back to the original dtype.
+    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has a larger error w.r.t. the original implementation.
+
+    The `in_place` option determines whether to modify dY in-place to store dX during the backward pass. It defaults to `False`; setting it to `True` saves memory.
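+
+    A minimal usage sketch (shapes, eps, and device are illustrative):
+
+        X = torch.randn(2, 16, 4096, device="cuda")
+        R = torch.randn(2, 16, 4096, device="cuda")
+        W = torch.ones(4096, device="cuda")
+        Y, S = LigerFusedAddRMSNormFunction.apply(X, R, W, 1e-6, 0.0, "llama")
+        # Y is rmsnorm(X + R) scaled by W; S is the updated residual X + R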
+ """ + + @staticmethod + @ensure_contiguous + def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False): + """ + X: (B, T, H) or (BxT, H) + W: (H,) + """ + # TODO: add row_mode + Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode) + ctx.offset = offset + ctx.casting_mode = casting_mode + ctx.in_place = in_place + ctx.BLOCK_SIZE = BLOCK_SIZE + ctx.num_warps = num_warps + ctx.save_for_backward(S, W, RSTD) + return Y, S + + @staticmethod + @ensure_contiguous + def backward(ctx, dY, dS_out): + """ + Y: (B, T, H) or (BxT, H) + """ + S, W, RSTD = ctx.saved_tensors + dX, dR, dW = fused_add_rms_norm_backward( + dY, + dS_out, + S, + W, + RSTD, + ctx.offset, + ctx.casting_mode, + ctx.BLOCK_SIZE, + ctx.num_warps, + ctx.in_place, + ) + + return dX, dR, dW, None, None, None, None, None diff --git a/src/liger_kernel/ops/fused_linear_cross_entropy.py b/src/liger_kernel/ops/fused_linear_cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..01f1b565866a2e2cc9b6c60e5898fc269250bcd5 --- /dev/null +++ b/src/liger_kernel/ops/fused_linear_cross_entropy.py @@ -0,0 +1,400 @@ +import torch +import triton + +from liger_kernel.ops.cross_entropy import liger_cross_entropy_kernel +from liger_kernel.ops.utils import amp_custom_bwd +from liger_kernel.ops.utils import amp_custom_fwd +from liger_kernel.ops.utils import element_mul_kernel +from liger_kernel.ops.utils import is_hip +from liger_kernel.utils import infer_device + +# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2 + + +def fused_linear_cross_entropy_forward( + _input, + weight, + target, + ce_weight=None, + bias=None, + ignore_index=-100, + lse_square_scale=0.0, + label_smoothing=0.0, + reduction="mean", + softcap=None, + return_z_loss=False, + accum_dtype=None, + use_token_scaling=False, + return_token_accuracy=False, + return_predicted_tokens=False, +): + assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}" + assert isinstance(return_token_accuracy, bool), ( + f"return_token_accuracy must be True or False. Got: {return_token_accuracy}" + ) + assert isinstance(return_predicted_tokens, bool), ( + f"return_predicted_tokens must be True or False. Got: {return_predicted_tokens}" + ) + device = _input.device + + input_requires_grad = _input.requires_grad + + # inputs have shape: BT x H + # materialized activations will have shape: BT x V + # the increase in memory = BT x V + # reduction can be achieved by partitioning the number of tokens BT into smaller chunks. 
+    # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be:
+    # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor
+    # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048
+    BT, H = _input.shape
+    V = weight.shape[0]
+    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+
+    inc_factor = triton.cdiv(V, H)  # (V + H - 1) // H
+    chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
+    num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size
+
+    grad_input = torch.zeros_like(_input, device=device)
+
+    # the loss is accumulated in fp32; gradient accumulators use accum_dtype if provided, otherwise the original dtype
+    if input_requires_grad:
+        if accum_dtype is None:
+            grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
+        else:
+            grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
+    else:
+        grad_weight = None
+        grad_bias = None
+
+    loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
+    z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+    token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
+    predicted_tokens_1d = torch.full((BT,), -1, dtype=torch.int64, device=device) if return_predicted_tokens else None
+
+    # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
+    target_mask = target != ignore_index
+    total_n_non_ignore = target_mask.sum().item()
+    total_sum_non_ignore_ce_weight = total_n_non_ignore
+    ce_weight_sum = 0.0
+    if ce_weight is not None:
+        assert ce_weight.shape[0] == V, f"If given, weight has to be a Tensor of size V. Got: {ce_weight.shape}"
+        assert torch.is_floating_point(ce_weight), (
+            f"If given, weight has to be a Tensor of floating point dtype. 
Got: {ce_weight.dtype}" + ) + total_sum_non_ignore_ce_weight = ( + torch.gather(ce_weight, dim=0, index=target.masked_select(target_mask)).sum().item() + ) + ce_weight_sum = ce_weight.sum().item() + if ce_weight.stride(-1) != 1: + ce_weight = ce_weight.contiguous() + + for chunk_id in range(num_chunks): + start_idx = chunk_id * chunk_size + end_idx = min((chunk_id + 1) * chunk_size, BT) + _input_chunk = _input[start_idx:end_idx] # chunk_size x H + + # when doing matmul, use the original precision + logits_chunk = _input_chunk @ weight.t() # chunk_size x V + if bias is not None: + logits_chunk = logits_chunk + bias + + target_chunk = target[start_idx:end_idx] # chunk_size, + + n_rows = logits_chunk.shape[0] + + # Compute predicted probabilities for token scaling if needed + if use_token_scaling: + # Compute softmax probabilities for scaling + # We need to compute this before the cross entropy kernel modifies logits_chunk + logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow + if softcap is not None: + logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap) + + # Compute softmax to get predicted probabilities + probs = torch.softmax(logits_for_softmax, dim=-1) + + # Get predicted probabilities for token scaling, handling ignored targets + valid_target_mask = target_chunk != ignore_index + valid_targets = target_chunk[valid_target_mask] + + if len(valid_targets) > 0: + # Gather probabilities only for valid targets + valid_probs = probs[valid_target_mask] + pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1) + + # Create full tensor with zeros for ignored targets + pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device) + pred_probs[valid_target_mask] = pred_probs_valid + else: + # All targets are ignored + pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device) + + # Store the scaling factors + scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow + + # unreduced loss + loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size, + z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None + token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None + predicted_tokens_1d_slice = predicted_tokens_1d[start_idx:end_idx] if return_predicted_tokens else None + + # ensure _input and target are contiguous + logits_chunk = logits_chunk.contiguous() + target_chunk = target_chunk.contiguous() + + # Here we calculate the gradient of logits_chunk in place so we can save memory. 
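+        # Illustrative memory math (hypothetical sizes): with BT=16384, V=32000
+        # in bf16, keeping full logits plus a separate gradient buffer would cost
+        # 2 * 16384 * 32000 * 2 bytes ~= 2 GiB; reusing each logits_chunk as its
+        # own gradient keeps the extra footprint to one chunk_size x V slice.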
+        liger_cross_entropy_kernel[(n_rows,)](
+            X_ptr=logits_chunk,
+            X_stride=logits_chunk.stride(-2),
+            Y_ptr=target_chunk,
+            Y_stride=target_chunk.stride(-1),  # always 1
+            weight_ptr=ce_weight,
+            loss_ptr=loss_1d_slice,
+            z_loss_ptr=z_loss_1d_slice,
+            loss_stride=loss_1d_slice.stride(-1),  # always 1
+            token_accuracy_ptr=token_accuracy_1d_slice,
+            token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
+            if return_token_accuracy
+            else 0,  # always 1 if accuracy is enabled
+            predicted_tokens_ptr=predicted_tokens_1d_slice,
+            predicted_tokens_stride=predicted_tokens_1d_slice.stride(-1)
+            if return_predicted_tokens
+            else 0,  # always 1 if predicted tokens is enabled
+            n_cols=V,
+            n_non_ignore=total_n_non_ignore,
+            sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
+            weight_sum=ce_weight_sum,
+            ignore_index=ignore_index,
+            lse_square_scale=lse_square_scale,
+            label_smoothing=label_smoothing,
+            reduction=reduction,
+            softcap=softcap,
+            RETURN_Z_LOSS=return_z_loss,
+            RETURN_TOKEN_ACCURACY=return_token_accuracy,
+            RETURN_PREDICTED_TOKENS=return_predicted_tokens,
+            HAS_WEIGHT=True if ce_weight is not None else False,
+            HAS_SOFTCAPPING=True if softcap is not None else False,
+            HAS_GRADIENTS=input_requires_grad,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=32 if not is_hip() else 16,
+        )
+
+        # Apply token scaling if requested
+        if use_token_scaling:
+            loss_1d_slice = loss_1d_slice * scaling_factors
+            if return_z_loss:
+                z_loss_1d_slice = z_loss_1d_slice * scaling_factors
+
+        loss_1d[start_idx:end_idx] = loss_1d_slice
+        if return_z_loss:
+            z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
+        if return_token_accuracy:
+            token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
+        if return_predicted_tokens:
+            predicted_tokens_1d[start_idx:end_idx] = predicted_tokens_1d_slice
+        grad_logits_chunk = logits_chunk  # chunk_size x V
+
+        # Apply token scaling to gradients if requested
+        if use_token_scaling:
+            # Expand scaling factors to match gradient dimensions
+            scaling_factors_expanded = scaling_factors.unsqueeze(-1)  # chunk_size x 1
+            grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
+
+        if input_requires_grad:
+            grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
+
+        if grad_weight is not None and input_requires_grad:
+            grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
+
+        if bias is not None and input_requires_grad:
+            torch.add(
+                input=grad_bias,
+                other=grad_logits_chunk.sum(dim=0),
+                out=grad_bias,
+                alpha=1.0,
+            )
+
+    # Note: reduction == 'none' would need extra handling in the backward pass;
+    # the backward kernel currently assumes a scalar grad_output.
+    if reduction == "none":
+        # Return per-token losses
+        loss = loss_1d
+        z_loss = z_loss_1d if return_z_loss else None
+        token_accuracy = token_accuracy_1d if return_token_accuracy else None
+    else:
+        loss = torch.sum(loss_1d)
+        z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+        # For accuracy, we compute the mean across all non-ignored tokens
+        token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
+
+    predicted_tokens = predicted_tokens_1d if return_predicted_tokens else None
+
+    # Cast back to the original dtype
+    grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
+    grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
+
+    return loss, z_loss, token_accuracy, predicted_tokens, grad_input, grad_weight, grad_bias
+
+
+def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
+    # If cross entropy is the last layer, grad_output is 1.0. Skip the mul to save time
+    if not torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+        # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
+        # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
+        BT, H = grad_input.shape
+        n_rows = BT
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H))
+
+        element_mul_kernel[(n_rows,)](
+            grad_input,
+            grad_input.stride(-2),
+            grad_output,
+            H,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=32 if not is_hip() else 16,
+        )
+
+        # handle grad_weight
+        if grad_weight is not None:
+            V, H = grad_weight.shape
+            n_rows = V
+
+            element_mul_kernel[(n_rows,)](
+                grad_weight,
+                grad_weight.stride(-2),
+                grad_output,
+                H,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32 if not is_hip() else 16,
+            )
+
+        if grad_bias is not None:
+            V = grad_bias.shape[0]
+            n_rows = V
+
+            element_mul_kernel[(n_rows,)](
+                grad_bias,
+                grad_bias.stride(-1),
+                grad_output,
+                1,
+                BLOCK_SIZE=BLOCK_SIZE,
+                num_warps=32 if not is_hip() else 16,
+            )
+    return grad_input, grad_weight, grad_bias
+
+
+class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
+    @staticmethod
+    @amp_custom_fwd
+    def forward(
+        ctx,
+        _input,
+        weight,
+        target,
+        bias=None,
+        ce_weight=None,
+        ignore_index=-100,
+        lse_square_scale=0.0,
+        label_smoothing=0.0,
+        reduction="mean",
+        softcap=None,
+        return_z_loss: bool = False,
+        accum_dtype=None,
+        use_token_scaling: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
+    ):
+        """
+        Fusing the last linear layer with cross-entropy loss
+        Reference: https://github.com/mgmalek/efficient_cross_entropy
+
+        Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding
+        the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can
+        compute the gradient at the forward pass. By doing so, we don't have to store the _input and target
+        for the backward pass.
+
+        _input: (B*T, H) where B is batch size, T is sequence length, H is hidden dimension.
+        target: (B*T) where each value is in [0, V-1]
+        weight: (V, H) where V is the number of classes
+        bias: (V) where V is the number of classes
+        ce_weight: a manual rescaling weight given to each class. 
If given, has to be a Tensor of size V and floating point dtype + ignore_index: the index to ignore in the target + label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing. + reduction: reduction to apply + accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations. + Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype + use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached). + When True, each token's loss is multiplied by the model's predicted probability for that token's true class. + Default: False. + return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False` + return_predicted_tokens (bool): When `return_predicted_tokens` is `True`, returns per-token predicted class indices (argmax) without materializing logits. Default: `False` + """ + + loss, z_loss, token_accuracy, predicted_tokens, grad_input, grad_weight, grad_bias = ( + fused_linear_cross_entropy_forward( + _input=_input, + weight=weight, + target=target, + bias=bias, + ce_weight=ce_weight, + ignore_index=ignore_index, + lse_square_scale=lse_square_scale, + label_smoothing=label_smoothing, + reduction=reduction, + softcap=softcap, + return_z_loss=return_z_loss, + accum_dtype=accum_dtype, + use_token_scaling=use_token_scaling, + return_token_accuracy=return_token_accuracy, + return_predicted_tokens=return_predicted_tokens, + ) + ) + # downcast to dtype and store for backward + ctx.save_for_backward( + grad_input.detach(), + grad_weight.detach() if grad_weight is not None else None, + grad_bias.detach() if grad_bias is not None else None, + ) + ctx.return_z_loss = return_z_loss + ctx.return_token_accuracy = return_token_accuracy + ctx.return_predicted_tokens = return_predicted_tokens + return loss, z_loss, token_accuracy, predicted_tokens + + @staticmethod + @amp_custom_bwd + def backward(ctx, grad_output, grad_output2, grad_output3, grad_output4): + if ctx.return_z_loss: + del grad_output2 # z_loss is only for logging + if ctx.return_token_accuracy: + del grad_output3 # token_accuracy is only for metrics + if ctx.return_predicted_tokens: + del grad_output4 # predicted_tokens is only for metrics + (grad_input, grad_weight, grad_bias) = ctx.saved_tensors + grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward( + grad_output, grad_input, grad_weight, grad_bias + ) + return ( + grad_input, + grad_weight, + None, + grad_bias, + None, + None, + None, + None, + None, + None, + None, + None, + None, # use_token_scaling + None, # return_token_accuracy + None, # return_predicted_tokens + ) diff --git a/src/liger_kernel/ops/fused_linear_jsd.py b/src/liger_kernel/ops/fused_linear_jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..e31b10769b6004522cea805d459b5c59a5ed56b9 --- /dev/null +++ b/src/liger_kernel/ops/fused_linear_jsd.py @@ -0,0 +1,228 @@ +from typing import Optional + +import torch +import triton + +from liger_kernel.ops.jsd import _jsd_kernel +from liger_kernel.ops.utils import amp_custom_bwd +from liger_kernel.ops.utils import amp_custom_fwd +from liger_kernel.ops.utils import element_mul_kernel +from liger_kernel.ops.utils import is_hip +from liger_kernel.utils import infer_device + +# The hard limit of 
TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 4096 if infer_device() == "xpu" else 65536 // 2 + + +def fused_linear_jsd_forward( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels, + jsd_beta, + ignore_index, + has_label, + temperature, +): + device = student_input.device + dtype = student_input.dtype + + # inputs have shape: BT x H + # materialized activations will have shape: BT x V + # the increase in memory = BT x V + # reduction can be achieved by partitioning the number of tokens BT into smaller chunks. + # for ex: if we were to achieve the same memory consumption as BT x H, then the chunk size should be: + # inc_factor = (V+H-1)//H, chunk_size = (BT + inc_factor - 1)//inc_factor + # for ex: BT = 4096*4, V = 32000, H = 4096 ==> inc_factor = 8, chunk_size = 2048 + BT, H = student_input.shape + V = student_weight.shape[0] + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + + inc_factor = triton.cdiv(V, H) # (V + H - 1) // H + chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor + num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size + + grad_weight = torch.zeros_like(student_weight, device=device) if student_weight.requires_grad else None + grad_input = torch.zeros_like(student_input) + # we use fp32 for loss accumulator + loss_1d = torch.zeros((BT, V), dtype=torch.float32, device=device) + + if has_label: + n_non_ignore = (shift_labels != ignore_index).sum().item() + else: + n_non_ignore = BT + + for chunk_id in range(num_chunks): + start_idx = chunk_id * chunk_size + end_idx = min((chunk_id + 1) * chunk_size, BT) + + # chunk both inputs, shape: chunk_size x H + student_input_chunk = student_input[start_idx:end_idx] + teacher_input_chunk = teacher_input[start_idx:end_idx] + + # shape: chunk_size x V + # For anything starting from logits to the final JSD loss, we do computation + # in FP32 to avoid losing numerical stability. + student_logits_chunk = (student_input_chunk @ student_weight.t()).to(torch.float32) + teacher_logits_chunk = (teacher_input_chunk @ teacher_weight.t()).to(torch.float32) + chunk_n_rows = student_logits_chunk.shape[0] + + # unreduced loss + loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size + # log-softmax with temperature + student_logits_chunk = student_logits_chunk / temperature + teacher_logits_chunk = teacher_logits_chunk / temperature + student_prob_chunk = torch.log_softmax(student_logits_chunk, dim=-1) + teacher_prob_chunk = torch.log_softmax(teacher_logits_chunk, dim=-1) + + # ensure _input and target are contiguous + student_prob_chunk = student_prob_chunk.contiguous() + teacher_prob_chunk = teacher_prob_chunk.contiguous() + + # Here we calculate the gradient of prob_chunk in place so we can save memory. 
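+        # dX_ptr below aliases X_ptr (both point at student_prob_chunk), so the
+        # kernel overwrites the student log-probs with the loss gradient and no
+        # extra chunk_size x V buffer is allocated.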
+ _jsd_kernel[(chunk_n_rows,)]( + X_ptr=student_prob_chunk, + X_stride=student_prob_chunk.stride(-2), + Y_ptr=teacher_prob_chunk, + Y_stride=teacher_prob_chunk.stride(-2), + loss_ptr=loss_1d_slice, + loss_stride=loss_1d_slice.stride(-2), + dX_ptr=student_prob_chunk, + dX_stride=student_prob_chunk.stride(-2), + label_ptr=( + shift_labels[start_idx:end_idx] if has_label else torch.empty(1, device=device) + ), # dummy ptr if no label + beta=jsd_beta, + n_non_ignore=n_non_ignore, + ignore_index=ignore_index, + n_cols=V, + BLOCK_SIZE=BLOCK_SIZE, + HAS_LABEL=has_label, + ) + loss_1d[start_idx:end_idx] = loss_1d_slice + # gradients of prob_chunk in place, shape: chunk_size x V + # gradients of logits_chunk in place, shape: chunk_size x V + student_logits_chunk = ( + student_prob_chunk + - torch.softmax(student_logits_chunk, dim=-1) + * student_prob_chunk.sum(dim=-1, keepdim=True).broadcast_to(student_prob_chunk.shape) + ) / temperature + # now we traverse back to grad w.r.t. input to `lm_head` and grad + # w.r.t. `lm_head` which should be computed in original dtype + student_logits_chunk = student_logits_chunk.to(dtype) + grad_input[start_idx:end_idx] = student_logits_chunk @ student_weight + + if grad_weight is not None: + grad_weight.add_(student_logits_chunk.t() @ student_input_chunk) + + loss = torch.sum(loss_1d) + return loss, grad_input, grad_weight + + +def fused_linear_jsd_backward(grad_output, grad_input, grad_weight): + # If JSD is the last layer, grad_output is 1.0. Skip the mul to save time + if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. + BT, H = grad_input.shape + n_rows = BT + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + element_mul_kernel[(n_rows,)]( + grad_input, + grad_input.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32 if not is_hip() else 16, + ) + + # handle grad_weight + if grad_weight is not None: + V, H = grad_weight.shape + n_rows = V + + element_mul_kernel[(n_rows,)]( + grad_weight, + grad_weight.stride(-2), + grad_output, + H, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=32 if not is_hip() else 16, + ) + + return grad_input, grad_weight + + +class LigerFusedLinearJSDFunction(torch.autograd.Function): + """ + Fusing the last linear layer with generalized JSD + + Handle the forward and backward pass of the final linear layer via JSD by avoiding + the materialization of the large logits tensor. Since JSD is the last layer, we can + compute the gradient at the forward pass. + """ + + @staticmethod + @amp_custom_fwd + def forward( + ctx, + student_input: torch.Tensor, + student_weight: torch.Tensor, + teacher_input: torch.Tensor, + teacher_weight: torch.Tensor, + shift_labels: Optional[torch.Tensor] = None, + jsd_beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + """ + Args: + + student_input (torch.tensor): input of the last projection layer in student model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension. + student_weight (torch.tensor): the last projection layer in student model, with shape (V, H), where V is vocab size + teacher_input (torch.tensor): input of the last projection layer in teacher model, with shape (B*T, H), where B is batch size, T is sequence length, H is hidden dimension. 
+ teacher_weight (torch.tensor): the last projection layer in teacher model, with shape (V, H), where V is vocab size + shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1]. + jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5` + ignore_index (int): the index to ignore. Default: -100 + temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0` + + Returns: + loss (torch.Tensor): generalized JSD + """ + has_label = False + if shift_labels is not None: + assert shift_labels.shape == (teacher_input.shape[0],), ( + f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, grad_input, grad_weight = fused_linear_jsd_forward( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels, + jsd_beta, + ignore_index, + has_label, + temperature, + ) + # downcast to dtype and store for backward + ctx.save_for_backward( + grad_input.detach(), + grad_weight.detach() if grad_weight is not None else None, + ) + return loss + + @staticmethod + @amp_custom_bwd + def backward(ctx, grad_output): + (grad_input, grad_weight) = ctx.saved_tensors + grad_input, grad_weight = fused_linear_jsd_backward(grad_output, grad_input, grad_weight) + return (grad_input, grad_weight, None, None, None, None, None, None) diff --git a/src/liger_kernel/ops/fused_neighborhood_attention.py b/src/liger_kernel/ops/fused_neighborhood_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..557358fc9a1cb2916aaeb6a61c6da47ba4a9f3fc --- /dev/null +++ b/src/liger_kernel/ops/fused_neighborhood_attention.py @@ -0,0 +1,1022 @@ +import math + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.softmax import _softmax_backward +from liger_kernel.ops.softmax import _softmax_forward +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import ensure_contiguous + + +@triton.jit +def _neighborhood_mask_kernel( + mask_ptr, + seq_len: tl.constexpr, + kernel_size: tl.constexpr, + dilation: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Generate a neighborhood attention mask for a given sequence. + + This kernel creates a binary mask that defines which positions in a sequence + can attend to each other based on a neighborhood window with optional dilation. + Each row of the mask corresponds to a query position, and each column indicates + whether that key position is within the allowed neighborhood. + + The neighborhood is defined as positions within kernel_size//2 * dilation distance + from the center position. When dilation > 1, only positions at multiples of the + dilation factor are included in the neighborhood. + + Args: + mask_ptr: Pointer to the output mask tensor [seq_len, seq_len] + seq_len: Length of the input sequence + kernel_size: Size of the neighborhood window (must be odd) + dilation: Dilation factor for the neighborhood pattern + BLOCK_SIZE: Block size for processing (compile-time constant) + num_stages: Number of pipeline stages (compile-time constant) + num_warps: Number of warps (compile-time constant) + + Grid: (seq_len,) + Each program processes one row of the mask matrix. 
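+
+    Illustrative pattern (hypothetical sizes): with seq_len=8, kernel_size=3,
+    dilation=1, row 4 gets ones at columns {3, 4, 5}; with dilation=2 the same
+    row gets ones at columns {2, 4, 6}.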
+ """ + row_id = tl.program_id(0) + + center = row_id + half_kernel = kernel_size // 2 + + start = tl.maximum(0, center - half_kernel * dilation) + end = tl.minimum(seq_len, center + half_kernel * dilation + 1) + + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < seq_len + + valid_neighbors = (col_offsets >= start) & (col_offsets < end) + if dilation > 1: + relative_pos = col_offsets - center + valid_dilation = (relative_pos % dilation) == 0 + valid_neighbors = valid_neighbors & valid_dilation + + mask_values = tl.where(valid_neighbors & mask, 1.0, 0.0) + + base_offset = row_id * seq_len + tl.store(mask_ptr + base_offset + col_offsets, mask_values, mask=mask) + + +@triton.jit +def _fused_neighborhood_attention_qk_kernel( + Q_ptr, + K_ptr, + QK_ptr, + mask_ptr, + q_batch_stride, + q_head_stride, + q_seq_stride, + q_dim_stride, + k_batch_stride, + k_head_stride, + k_seq_stride, + k_dim_stride, + qk_batch_stride, + qk_head_stride, + qk_seq_stride, + qk_seq2_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + scale: tl.constexpr, + kernel_size: tl.constexpr, + dilation: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute Q @ K^T with neighborhood masking and scaling. + + This kernel performs the first stage of neighborhood attention by computing + the attention scores between queries and keys, applying scaling, and masking + positions outside the neighborhood window. The result is a matrix of attention + scores ready for softmax normalization. + + The computation is tiled across sequence dimensions for memory efficiency. + Each tile computes a block of the attention score matrix by iterating over + the head dimension and accumulating dot products. + + Args: + Q_ptr: Pointer to query tensor [batch_size, num_heads, seq_len, head_dim] + K_ptr: Pointer to key tensor [batch_size, num_heads, seq_len, head_dim] + QK_ptr: Pointer to output tensor [batch_size, num_heads, seq_len, seq_len] + mask_ptr: Pointer to neighborhood mask [seq_len, seq_len] + q_*_stride: Strides for query tensor + k_*_stride: Strides for key tensor + qk_*_stride: Strides for output tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + scale: Scaling factor for attention scores (typically 1/sqrt(head_dim)) + kernel_size: Size of the neighborhood window + dilation: Dilation factor for the neighborhood + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for sequence dimension (cols) + BLOCK_SIZE_K: Block size for head dimension + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(seq_len, BLOCK_SIZE_N)) + Each program computes a tile of the attention score matrix. 
+ """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, head_dim, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < head_dim + + q_ptrs = ( + Q_ptr + + batch_id * q_batch_stride + + head_id * q_head_stride + + row_offsets[:, None] * q_seq_stride + + k_offsets[None, :] * q_dim_stride + ) + q_mask = (row_offsets[:, None] < seq_len) & k_mask[None, :] + q_chunk = tl.load(q_ptrs, mask=q_mask, other=0.0) + + k_ptrs = ( + K_ptr + + batch_id * k_batch_stride + + head_id * k_head_stride + + col_offsets[:, None] * k_seq_stride + + k_offsets[None, :] * k_dim_stride + ) + k_mask = (col_offsets[:, None] < seq_len) & k_mask[None, :] + k_chunk = tl.load(k_ptrs, mask=k_mask, other=0.0) + + acc += tl.dot(q_chunk, tl.trans(k_chunk)) + + acc = acc * scale + + mask_ptrs = mask_ptr + row_offsets[:, None] * seq_len + col_offsets[None, :] + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < seq_len) + neighborhood_mask = tl.load(mask_ptrs, mask=valid_mask, other=0.0) + + acc = tl.where(neighborhood_mask > 0.0, acc, float("-inf")) + + qk_ptrs = ( + QK_ptr + + batch_id * qk_batch_stride + + head_id * qk_head_stride + + row_offsets[:, None] * qk_seq_stride + + col_offsets[None, :] * qk_seq2_stride + ) + tl.store(qk_ptrs, acc, mask=valid_mask) + + +@triton.jit +def _fused_neighborhood_attention_av_kernel( + Attn_ptr, + V_ptr, + Out_ptr, + attn_batch_stride, + attn_head_stride, + attn_seq_stride, + attn_seq2_stride, + v_batch_stride, + v_head_stride, + v_seq_stride, + v_dim_stride, + out_batch_stride, + out_head_stride, + out_seq_stride, + out_dim_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute Attention @ V to produce the final output. + + This kernel performs the second stage of neighborhood attention by multiplying + the normalized attention weights with the value matrix. The computation is + tiled for memory efficiency, with each tile computing a block of the output. + + Args: + Attn_ptr: Pointer to attention weights [batch_size, num_heads, seq_len, seq_len] + V_ptr: Pointer to value tensor [batch_size, num_heads, seq_len, head_dim] + Out_ptr: Pointer to output tensor [batch_size, num_heads, seq_len, head_dim] + attn_*_stride: Strides for attention weights tensor + v_*_stride: Strides for value tensor + out_*_stride: Strides for output tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for head dimension (cols) + BLOCK_SIZE_K: Block size for sequence dimension (reduction) + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(head_dim, BLOCK_SIZE_N)) + Each program computes a tile of the output matrix. 
+ """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, seq_len, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < seq_len + + attn_ptrs = ( + Attn_ptr + + batch_id * attn_batch_stride + + head_id * attn_head_stride + + row_offsets[:, None] * attn_seq_stride + + k_offsets[None, :] * attn_seq2_stride + ) + attn_mask = (row_offsets[:, None] < seq_len) & k_mask[None, :] + attn_chunk = tl.load(attn_ptrs, mask=attn_mask, other=0.0) + + v_ptrs = ( + V_ptr + + batch_id * v_batch_stride + + head_id * v_head_stride + + k_offsets[:, None] * v_seq_stride + + col_offsets[None, :] * v_dim_stride + ) + v_mask = k_mask[:, None] & (col_offsets[None, :] < head_dim) + v_chunk = tl.load(v_ptrs, mask=v_mask, other=0.0) + + acc += tl.dot(attn_chunk, v_chunk) + + out_ptrs = ( + Out_ptr + + batch_id * out_batch_stride + + head_id * out_head_stride + + row_offsets[:, None] * out_seq_stride + + col_offsets[None, :] * out_dim_stride + ) + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < head_dim) + tl.store(out_ptrs, acc, mask=valid_mask) + + +@triton.jit +def _fused_neighborhood_attention_grad_qk_kernel( + grad_attn_ptr, + K_ptr, + grad_Q_ptr, + grad_attn_batch_stride, + grad_attn_head_stride, + grad_attn_seq_stride, + grad_attn_seq2_stride, + k_batch_stride, + k_head_stride, + k_seq_stride, + k_dim_stride, + grad_q_batch_stride, + grad_q_head_stride, + grad_q_seq_stride, + grad_q_dim_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + scale: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute gradient with respect to queries: grad_Q = grad_attn @ K * scale. + + This kernel computes the gradient of the loss with respect to the query tensor + by multiplying the gradient of attention weights with the key tensor. The + computation follows the chain rule for the attention mechanism. + + Args: + grad_attn_ptr: Pointer to gradient of attention weights [batch_size, num_heads, seq_len, seq_len] + K_ptr: Pointer to key tensor [batch_size, num_heads, seq_len, head_dim] + grad_Q_ptr: Pointer to output gradient tensor [batch_size, num_heads, seq_len, head_dim] + grad_attn_*_stride: Strides for gradient attention tensor + k_*_stride: Strides for key tensor + grad_q_*_stride: Strides for gradient query tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + scale: Scaling factor applied to attention scores + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for head dimension (cols) + BLOCK_SIZE_K: Block size for sequence dimension (reduction) + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(head_dim, BLOCK_SIZE_N)) + Each program computes a tile of the query gradient matrix. 
+ """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, seq_len, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < seq_len + + grad_attn_ptrs = ( + grad_attn_ptr + + batch_id * grad_attn_batch_stride + + head_id * grad_attn_head_stride + + row_offsets[:, None] * grad_attn_seq_stride + + k_offsets[None, :] * grad_attn_seq2_stride + ) + grad_attn_mask = (row_offsets[:, None] < seq_len) & k_mask[None, :] + grad_attn_chunk = tl.load(grad_attn_ptrs, mask=grad_attn_mask, other=0.0) + + k_ptrs = ( + K_ptr + + batch_id * k_batch_stride + + head_id * k_head_stride + + k_offsets[:, None] * k_seq_stride + + col_offsets[None, :] * k_dim_stride + ) + k_mask_2d = k_mask[:, None] & (col_offsets[None, :] < head_dim) + k_chunk = tl.load(k_ptrs, mask=k_mask_2d, other=0.0) + + acc += tl.dot(grad_attn_chunk, k_chunk) + + acc = acc * scale + + grad_q_ptrs = ( + grad_Q_ptr + + batch_id * grad_q_batch_stride + + head_id * grad_q_head_stride + + row_offsets[:, None] * grad_q_seq_stride + + col_offsets[None, :] * grad_q_dim_stride + ) + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < head_dim) + tl.store(grad_q_ptrs, acc, mask=valid_mask) + + +@triton.jit +def _fused_neighborhood_attention_grad_k_kernel( + grad_attn_ptr, + Q_ptr, + grad_K_ptr, + grad_attn_batch_stride, + grad_attn_head_stride, + grad_attn_seq_stride, + grad_attn_seq2_stride, + q_batch_stride, + q_head_stride, + q_seq_stride, + q_dim_stride, + grad_k_batch_stride, + grad_k_head_stride, + grad_k_seq_stride, + grad_k_dim_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + scale: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute gradient with respect to keys: grad_K = grad_attn^T @ Q * scale. + + This kernel computes the gradient of the loss with respect to the key tensor + by multiplying the transpose of the gradient of attention weights with the + query tensor. The computation follows the chain rule for the attention mechanism. 
+ + Args: + grad_attn_ptr: Pointer to gradient of attention weights [batch_size, num_heads, seq_len, seq_len] + Q_ptr: Pointer to query tensor [batch_size, num_heads, seq_len, head_dim] + grad_K_ptr: Pointer to output gradient tensor [batch_size, num_heads, seq_len, head_dim] + grad_attn_*_stride: Strides for gradient attention tensor + q_*_stride: Strides for query tensor + grad_k_*_stride: Strides for gradient key tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + scale: Scaling factor applied to attention scores + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for head dimension (cols) + BLOCK_SIZE_K: Block size for sequence dimension (reduction) + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(head_dim, BLOCK_SIZE_N)) + Each program computes a tile of the key gradient matrix. + """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, seq_len, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < seq_len + + q_ptrs = ( + Q_ptr + + batch_id * q_batch_stride + + head_id * q_head_stride + + k_offsets[:, None] * q_seq_stride + + col_offsets[None, :] * q_dim_stride + ) + q_mask = k_mask[:, None] & (col_offsets[None, :] < head_dim) + q_chunk = tl.load(q_ptrs, mask=q_mask, other=0.0) + + grad_attn_T_ptrs = ( + grad_attn_ptr + + batch_id * grad_attn_batch_stride + + head_id * grad_attn_head_stride + + row_offsets[:, None] * grad_attn_seq2_stride + + k_offsets[None, :] * grad_attn_seq_stride + ) + grad_attn_T_mask = (row_offsets[:, None] < seq_len) & k_mask[None, :] + grad_attn_T_chunk = tl.load(grad_attn_T_ptrs, mask=grad_attn_T_mask, other=0.0) + + acc += tl.dot(grad_attn_T_chunk, q_chunk) + + acc = acc * scale + + grad_k_ptrs = ( + grad_K_ptr + + batch_id * grad_k_batch_stride + + head_id * grad_k_head_stride + + row_offsets[:, None] * grad_k_seq_stride + + col_offsets[None, :] * grad_k_dim_stride + ) + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < head_dim) + tl.store(grad_k_ptrs, acc, mask=valid_mask) + + +@triton.jit +def _fused_neighborhood_attention_grad_v_kernel( + Attn_ptr, + grad_output_ptr, + grad_V_ptr, + attn_batch_stride, + attn_head_stride, + attn_seq_stride, + attn_seq2_stride, + grad_out_batch_stride, + grad_out_head_stride, + grad_out_seq_stride, + grad_out_dim_stride, + grad_v_batch_stride, + grad_v_head_stride, + grad_v_seq_stride, + grad_v_dim_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute gradient with respect to values: grad_V = Attn^T @ grad_output. + + This kernel computes the gradient of the loss with respect to the value tensor + by multiplying the transpose of the attention weights with the gradient of the + output. 
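+    In matrix form, with A the attention weights and O the output:
+    dL/dV = A^T @ (dL/dO).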
The computation follows the chain rule for the attention mechanism. + + Args: + Attn_ptr: Pointer to attention weights [batch_size, num_heads, seq_len, seq_len] + grad_output_ptr: Pointer to gradient of output [batch_size, num_heads, seq_len, head_dim] + grad_V_ptr: Pointer to output gradient tensor [batch_size, num_heads, seq_len, head_dim] + attn_*_stride: Strides for attention weights tensor + grad_out_*_stride: Strides for gradient output tensor + grad_v_*_stride: Strides for gradient value tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for head dimension (cols) + BLOCK_SIZE_K: Block size for sequence dimension (reduction) + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(head_dim, BLOCK_SIZE_N)) + Each program computes a tile of the value gradient matrix. + """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, seq_len, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < seq_len + + attn_ptrs = ( + Attn_ptr + + batch_id * attn_batch_stride + + head_id * attn_head_stride + + k_offsets[:, None] * attn_seq_stride + + row_offsets[None, :] * attn_seq2_stride + ) + attn_mask = k_mask[:, None] & (row_offsets[None, :] < seq_len) + attn_chunk = tl.load(attn_ptrs, mask=attn_mask, other=0.0) + + grad_out_ptrs = ( + grad_output_ptr + + batch_id * grad_out_batch_stride + + head_id * grad_out_head_stride + + k_offsets[:, None] * grad_out_seq_stride + + col_offsets[None, :] * grad_out_dim_stride + ) + grad_out_mask = k_mask[:, None] & (col_offsets[None, :] < head_dim) + grad_out_chunk = tl.load(grad_out_ptrs, mask=grad_out_mask, other=0.0) + + acc += tl.dot(tl.trans(attn_chunk), grad_out_chunk) + + grad_v_ptrs = ( + grad_V_ptr + + batch_id * grad_v_batch_stride + + head_id * grad_v_head_stride + + row_offsets[:, None] * grad_v_seq_stride + + col_offsets[None, :] * grad_v_dim_stride + ) + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < head_dim) + tl.store(grad_v_ptrs, acc, mask=valid_mask) + + +@triton.jit +def _fused_neighborhood_attention_grad_attn_kernel( + grad_output_ptr, + V_ptr, + grad_attn_ptr, + grad_out_batch_stride, + grad_out_head_stride, + grad_out_seq_stride, + grad_out_dim_stride, + v_batch_stride, + v_head_stride, + v_seq_stride, + v_dim_stride, + grad_attn_batch_stride, + grad_attn_head_stride, + grad_attn_seq_stride, + grad_attn_seq2_stride, + batch_size: tl.constexpr, + num_heads: tl.constexpr, + seq_len: tl.constexpr, + head_dim: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + num_stages: tl.constexpr, + num_warps: tl.constexpr, +): + """ + Compute gradient with respect to attention weights: grad_attn = grad_output @ V^T. + + This kernel computes the gradient of the loss with respect to the attention + weights by multiplying the gradient of the output with the transpose of the + value tensor. 
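+    In matrix form: dL/dA = (dL/dO) @ V^T.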
This gradient will later be passed through the softmax backward + pass to compute gradients for the attention scores. + + Args: + grad_output_ptr: Pointer to gradient of output [batch_size, num_heads, seq_len, head_dim] + V_ptr: Pointer to value tensor [batch_size, num_heads, seq_len, head_dim] + grad_attn_ptr: Pointer to output gradient tensor [batch_size, num_heads, seq_len, seq_len] + grad_out_*_stride: Strides for gradient output tensor + v_*_stride: Strides for value tensor + grad_attn_*_stride: Strides for gradient attention tensor + batch_size: Number of batches + num_heads: Number of attention heads + seq_len: Sequence length + head_dim: Dimension of each attention head + BLOCK_SIZE_M: Block size for sequence dimension (rows) + BLOCK_SIZE_N: Block size for sequence dimension (cols) + BLOCK_SIZE_K: Block size for head dimension (reduction) + num_stages: Number of pipeline stages + num_warps: Number of warps + + Grid: (batch_size * num_heads, cdiv(seq_len, BLOCK_SIZE_M), cdiv(seq_len, BLOCK_SIZE_N)) + Each program computes a tile of the attention gradient matrix. + """ + batch_head_id = tl.program_id(0) + tile_m = tl.program_id(1) + tile_n = tl.program_id(2) + + batch_id = batch_head_id // num_heads + head_id = batch_head_id % num_heads + + row_start = tile_m * BLOCK_SIZE_M + col_start = tile_n * BLOCK_SIZE_N + + row_offsets = row_start + tl.arange(0, BLOCK_SIZE_M) + col_offsets = col_start + tl.arange(0, BLOCK_SIZE_N) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k_start in range(0, head_dim, BLOCK_SIZE_K): + k_offsets = k_start + tl.arange(0, BLOCK_SIZE_K) + k_mask = k_offsets < head_dim + + grad_out_ptrs = ( + grad_output_ptr + + batch_id * grad_out_batch_stride + + head_id * grad_out_head_stride + + row_offsets[:, None] * grad_out_seq_stride + + k_offsets[None, :] * grad_out_dim_stride + ) + grad_out_mask = (row_offsets[:, None] < seq_len) & k_mask[None, :] + grad_out_chunk = tl.load(grad_out_ptrs, mask=grad_out_mask, other=0.0) + + v_ptrs = ( + V_ptr + + batch_id * v_batch_stride + + head_id * v_head_stride + + col_offsets[None, :] * v_seq_stride + + k_offsets[:, None] * v_dim_stride + ) + v_mask = (col_offsets[None, :] < seq_len) & k_mask[:, None] + v_chunk = tl.load(v_ptrs, mask=v_mask, other=0.0) + + acc += tl.dot(grad_out_chunk, v_chunk) + + grad_attn_ptrs = ( + grad_attn_ptr + + batch_id * grad_attn_batch_stride + + head_id * grad_attn_head_stride + + row_offsets[:, None] * grad_attn_seq_stride + + col_offsets[None, :] * grad_attn_seq2_stride + ) + valid_mask = (row_offsets[:, None] < seq_len) & (col_offsets[None, :] < seq_len) + tl.store(grad_attn_ptrs, acc, mask=valid_mask) + + +def fused_neighborhood_attention_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kernel_size: int = 7, + dilation: int = 1, + scale: float = None, + return_lse: bool = False, +) -> tuple: + """ + Fused neighborhood attention forward pass. 
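+
+    The pass runs in three stages: build the (seq_len, seq_len) neighborhood
+    mask, compute the masked and scaled Q @ K^T scores, then softmax the rows
+    and multiply by V. A minimal call sketch (shapes and device are
+    illustrative):
+
+        q = torch.randn(2, 8, 128, 64, device="cuda")
+        k, v = torch.randn_like(q), torch.randn_like(q)
+        out, attn, params = fused_neighborhood_attention_forward(q, k, v, kernel_size=7)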
+ + Args: + query: Query tensor of shape [batch_size, num_heads, seq_len, head_dim] + key: Key tensor of shape [batch_size, num_heads, seq_len, head_dim] + value: Value tensor of shape [batch_size, num_heads, seq_len, head_dim] + kernel_size: Size of the neighborhood window + dilation: Dilation factor for the neighborhood + scale: Scaling factor for attention scores (default: rsqrt(head_dim)) + return_lse: Whether to return log-sum-exp values + + Returns: + Tuple of (output tensor, softmax parameters for backward) + """ + batch_size, num_heads, seq_len, head_dim = query.shape + + if scale is None: + scale = 1.0 / math.sqrt(head_dim) + + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + output = torch.empty_like(query) + qk_scores = torch.empty(batch_size, num_heads, seq_len, seq_len, device=query.device, dtype=query.dtype) + + mask = torch.zeros(seq_len, seq_len, device=query.device, dtype=torch.float32) + + BLOCK_SIZE, num_warps = calculate_settings(seq_len) + BLOCK_SIZE_M = min(64, triton.next_power_of_2(seq_len)) + BLOCK_SIZE_N = min(64, triton.next_power_of_2(seq_len)) + BLOCK_SIZE_K = max(16, triton.next_power_of_2(head_dim)) + + num_stages = 4 if seq_len >= 512 else 2 + + grid_mask = (seq_len,) + _neighborhood_mask_kernel[grid_mask]( + mask, + seq_len, + kernel_size, + dilation, + BLOCK_SIZE, + num_stages, + num_warps, + ) + + grid_qk = (batch_size * num_heads, triton.cdiv(seq_len, BLOCK_SIZE_M), triton.cdiv(seq_len, BLOCK_SIZE_N)) + _fused_neighborhood_attention_qk_kernel[grid_qk]( + query, + key, + qk_scores, + mask, + query.stride(0), + query.stride(1), + query.stride(2), + query.stride(3), + key.stride(0), + key.stride(1), + key.stride(2), + key.stride(3), + qk_scores.stride(0), + qk_scores.stride(1), + qk_scores.stride(2), + qk_scores.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + scale, + kernel_size, + dilation, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + qk_reshaped = qk_scores.view(batch_size * num_heads * seq_len, seq_len) + attn_reshaped, BLOCK_SIZE_softmax, num_warps_softmax, multi_block_launch = _softmax_forward(qk_reshaped) + attn_weights = attn_reshaped.view(batch_size, num_heads, seq_len, seq_len) + + grid_av = (batch_size * num_heads, triton.cdiv(seq_len, BLOCK_SIZE_M), triton.cdiv(head_dim, BLOCK_SIZE_N)) + _fused_neighborhood_attention_av_kernel[grid_av]( + attn_weights, + value, + output, + attn_weights.stride(0), + attn_weights.stride(1), + attn_weights.stride(2), + attn_weights.stride(3), + value.stride(0), + value.stride(1), + value.stride(2), + value.stride(3), + output.stride(0), + output.stride(1), + output.stride(2), + output.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + if return_lse: + raise NotImplementedError("return_lse=True is not supported yet.") + + softmax_params = (BLOCK_SIZE_softmax, num_warps_softmax, multi_block_launch) + return output, attn_weights, softmax_params + + +class LigerFusedNeighborhoodAttentionFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, query, key, value, kernel_size=7, dilation=1, scale=None): + output, attn_weights, softmax_params = fused_neighborhood_attention_forward( + query, key, value, kernel_size, dilation, scale + ) + ctx.save_for_backward(query, key, value, attn_weights) + ctx.kernel_size = kernel_size + ctx.dilation = dilation + ctx.scale = scale + ctx.softmax_params = softmax_params + 
return output + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output): + query, key, value, attn_weights = ctx.saved_tensors + BLOCK_SIZE_softmax, num_warps_softmax, multi_block_launch = ctx.softmax_params + + batch_size, num_heads, seq_len, head_dim = query.shape + scale = ctx.scale if ctx.scale is not None else 1.0 / math.sqrt(head_dim) + + grad_query = torch.zeros_like(query) + grad_key = torch.zeros_like(key) + grad_value = torch.zeros_like(value) + grad_attn_weights = torch.zeros_like(attn_weights) + + BLOCK_SIZE_M = min(64, triton.next_power_of_2(seq_len)) + BLOCK_SIZE_N = min(64, triton.next_power_of_2(seq_len)) + BLOCK_SIZE_K = min(64, triton.next_power_of_2(head_dim)) + num_stages = 4 if seq_len >= 512 else 2 + _, num_warps = calculate_settings(seq_len) + + grid_grad_attn = ( + batch_size * num_heads, + triton.cdiv(seq_len, BLOCK_SIZE_M), + triton.cdiv(seq_len, BLOCK_SIZE_N), + ) + _fused_neighborhood_attention_grad_attn_kernel[grid_grad_attn]( + grad_output, + value, + grad_attn_weights, + grad_output.stride(0), + grad_output.stride(1), + grad_output.stride(2), + grad_output.stride(3), + value.stride(0), + value.stride(1), + value.stride(2), + value.stride(3), + grad_attn_weights.stride(0), + grad_attn_weights.stride(1), + grad_attn_weights.stride(2), + grad_attn_weights.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + grad_attn_reshaped = grad_attn_weights.view(batch_size * num_heads * seq_len, seq_len) + attn_reshaped = attn_weights.view(batch_size * num_heads * seq_len, seq_len) + + grad_qk_reshaped = _softmax_backward( + grad_attn_reshaped, attn_reshaped, BLOCK_SIZE_softmax, num_warps_softmax, multi_block_launch + ) + grad_qk_scores = grad_qk_reshaped.view(batch_size, num_heads, seq_len, seq_len) + + grid_grad_q = (batch_size * num_heads, triton.cdiv(seq_len, BLOCK_SIZE_M), triton.cdiv(head_dim, BLOCK_SIZE_N)) + _fused_neighborhood_attention_grad_qk_kernel[grid_grad_q]( + grad_qk_scores, + key, + grad_query, + grad_qk_scores.stride(0), + grad_qk_scores.stride(1), + grad_qk_scores.stride(2), + grad_qk_scores.stride(3), + key.stride(0), + key.stride(1), + key.stride(2), + key.stride(3), + grad_query.stride(0), + grad_query.stride(1), + grad_query.stride(2), + grad_query.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + scale, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + grid_grad_k = (batch_size * num_heads, triton.cdiv(seq_len, BLOCK_SIZE_M), triton.cdiv(head_dim, BLOCK_SIZE_N)) + _fused_neighborhood_attention_grad_k_kernel[grid_grad_k]( + grad_qk_scores, + query, + grad_key, + grad_qk_scores.stride(0), + grad_qk_scores.stride(1), + grad_qk_scores.stride(2), + grad_qk_scores.stride(3), + query.stride(0), + query.stride(1), + query.stride(2), + query.stride(3), + grad_key.stride(0), + grad_key.stride(1), + grad_key.stride(2), + grad_key.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + scale, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + grid_grad_v = (batch_size * num_heads, triton.cdiv(seq_len, BLOCK_SIZE_M), triton.cdiv(head_dim, BLOCK_SIZE_N)) + _fused_neighborhood_attention_grad_v_kernel[grid_grad_v]( + attn_weights, + grad_output, + grad_value, + attn_weights.stride(0), + attn_weights.stride(1), + attn_weights.stride(2), + attn_weights.stride(3), + grad_output.stride(0), + grad_output.stride(1), + grad_output.stride(2), + grad_output.stride(3), + 
grad_value.stride(0), + grad_value.stride(1), + grad_value.stride(2), + grad_value.stride(3), + batch_size, + num_heads, + seq_len, + head_dim, + BLOCK_SIZE_M, + BLOCK_SIZE_N, + BLOCK_SIZE_K, + num_stages, + num_warps, + ) + + return grad_query, grad_key, grad_value, None, None, None diff --git a/src/liger_kernel/ops/geglu.py b/src/liger_kernel/ops/geglu.py new file mode 100755 index 0000000000000000000000000000000000000000..6aa99c405797018d49974e4e357273d90bc413d9 --- /dev/null +++ b/src/liger_kernel/ops/geglu.py @@ -0,0 +1,143 @@ +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import tanh + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import tanh +else: + from triton.language.math import tanh + + +@triton.jit +def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr): + program_id = tl.program_id(0).to(tl.int64) + + # locate start index + a += program_id * stride + b += program_id * stride + c += program_id * stride + + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32) + b_row = tl.load(b + col_offsets, mask=mask, other=0) + + # tanh approximation form of GELU is computed with: + # 0.5 * a * (1 + tanh(sqrt(2 / pi) * (a + 0.044715 * a^3))) + sqrt_2_over_pi = 0.7978845608028654 # sqrt(2 / pi) + a_cubed = a_row * a_row * a_row + tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed) + tanh_result = tanh(tanh_arg) + geglu_a = 0.5 * a_row * (1 + tanh_result) + c_row = geglu_a.cast(b_row.dtype) * b_row + tl.store(c + col_offsets, c_row, mask=mask) + + +@triton.jit +def _geglu_tanh_backward_kernel(dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr): + program_id = tl.program_id(0).to(tl.int64) + + # locate start index + dc += program_id * stride + a += program_id * stride + b += program_id * stride + + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + dc_row = tl.load(dc + col_offsets, mask=mask, other=0) + a_row = tl.load(a + col_offsets, mask=mask, other=0).to(tl.float32) + b_row = tl.load(b + col_offsets, mask=mask, other=0) + + # recomputation to save memory + sqrt_2_over_pi = 0.7978845608028654 # sqrt(2 / pi) + a_cubed = a_row * a_row * a_row + tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed) + tanh_result = tanh(tanh_arg) + geglu_a = 0.5 * a_row * (1 + tanh_result) + geglu_a = geglu_a.to(dc_row.dtype).to(tl.float32) + + db_row = dc_row.cast(tl.float32) * geglu_a + + # Gradient w.r.t. 
a can be computed with: + # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2))) + # where z = sqrt(2/pi) * (a + 0.044715 * a^3) + term1 = 0.5 * (1 + tanh_result) + tanh_sq = tanh_result * tanh_result + term2 = 0.5 * a_row * (1 - tanh_sq) * (sqrt_2_over_pi * (1 + 3 * 0.044715 * a_row * a_row)) + da_row = dc_row * b_row * (term1 + term2) + + tl.store(a + col_offsets, da_row, mask=mask) + tl.store(b + col_offsets, db_row.to(dc_row.dtype), mask=mask) + + +def geglu_forward(a, b): + ori_shape = a.shape + + n_cols = ori_shape[-1] + a = a.view(-1, n_cols) + b = b.view(-1, n_cols) + c = torch.empty_like(a) + n_rows = a.shape[0] + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + + _geglu_tanh_forward_kernel[(n_rows,)]( + a, + b, + c, + c.stride(-2), + n_cols=n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + return a, b, c.view(*ori_shape) + + +def geglu_backward(a, b, dc): + ori_shape = dc.shape + n_cols = ori_shape[-1] + dc = dc.view(-1, n_cols) + n_rows = dc.shape[0] + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + + _geglu_tanh_backward_kernel[(n_rows,)]( + dc, + a, + b, + dc.stride(-2), + n_cols=n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + + return a.view(*ori_shape), b.view(*ori_shape) + + +class LigerGELUMulFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, a, b): + a, b, c = geglu_forward(a, b) + ctx.save_for_backward(a, b) + return c + + @staticmethod + @ensure_contiguous + def backward(ctx, dc): + a, b = ctx.saved_tensors + a, b = geglu_backward(a, b, dc) + return a, b diff --git a/src/liger_kernel/ops/group_norm.py b/src/liger_kernel/ops/group_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..865fc337f77518562e0cac3587f9b8fb84c679e7 --- /dev/null +++ b/src/liger_kernel/ops/group_norm.py @@ -0,0 +1,311 @@ +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.utils import infer_device +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import rsqrt + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import rsqrt +else: + from triton.language.math import rsqrt + +if infer_device() == "npu": + MAX_FUSED_SIZE = 16384 # 8192 +else: + MAX_FUSED_SIZE = 65536 + + +@triton.jit +def _group_norm_forward_kernel( + Y_ptr, # pointer to output, shape (n_rows, n_groups, hidden_size) + Y_row_stride, # stride of each row in output + Y_col_stride, # stride of each column in output + X_ptr, # pointer to input, shape (n_rows, n_groups, hidden_size) + X_row_stride, # stride of each row in input + X_col_stride, # stride of each column in input + Mean_ptr, # pointer to mean, shape (n_rows, n_groups) + Mean_row_stride, # stride of each row in mean + Mean_col_stride, # stride of each column in mean + RSTD_ptr, # pointer to rstd, shape (n_rows, n_groups) + RSTD_row_stride, # stride of each row in rstd + RSTD_col_stride, # stride of each column in rstd + W_ptr, # pointer to W + B_ptr, # pointer to B + hidden_size, # hidden size of X + channels_per_group, # the number of channels per group + eps, + BLOCK_SIZE: tl.constexpr, +): + """ + References: + 
https://nn.labml.ai/normalization/group_norm/index.html + """ + batch_idx = tl.program_id(0) + group_idx = tl.program_id(1) + + X_ptr += batch_idx * X_row_stride + group_idx * X_col_stride + Y_ptr += batch_idx * Y_row_stride + group_idx * Y_col_stride + + block_range = tl.arange(0, BLOCK_SIZE) + + # Compute mean and variance in a single pass using running sums (E[X] and E[X**2]) + s = 0.0 + squared_sum = 0.0 + for i in tl.range(0, hidden_size, BLOCK_SIZE): + hidden_size_offsets = i + block_range + mask = hidden_size_offsets < hidden_size + X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=0.0) + s += tl.sum(X) + # X**2 + squared_sum += tl.sum(X * X) + + m = s / hidden_size + + # variance = E[X**2] - E[X]**2 + variance = (squared_sum / hidden_size) - (m * m) + + # 1/std + rstd = rsqrt(variance + eps) + + # Normalize: flat loop over the full hidden_size (not per-channel). + # This avoids the nested channel × per_channel_hidden loop where + # BLOCK_SIZE >> hidden_size_per_channel causes massive padding waste. + hidden_size_per_channel = hidden_size // channels_per_group + for i in tl.range(0, hidden_size, BLOCK_SIZE): + hidden_size_offsets = i + block_range + mask = hidden_size_offsets < hidden_size + X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=m) + # Determine which channel each element belongs to, then load W/B + local_channel = hidden_size_offsets // hidden_size_per_channel + global_channel = group_idx * channels_per_group + local_channel + W = tl.load(W_ptr + global_channel, mask=mask) + B = tl.load(B_ptr + global_channel, mask=mask) + Y = (X - m) * rstd * W + B + tl.store(Y_ptr + hidden_size_offsets, Y, mask=mask) + + tl.store(Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, m) + tl.store(RSTD_ptr + batch_idx * RSTD_row_stride + group_idx * RSTD_col_stride, rstd) + + +@triton.jit +def _group_norm_backward_kernel( + X_ptr, # pointer to input, shape (n_rows, n_channels, hidden_size) + X_row_stride, # stride of each row in input + X_col_stride, # stride of each column in input + W_ptr, # pointer to weights, shape (n_channels) + Mean_ptr, # pointer to mean, shape (n_rows, n_groups) + Mean_ptr_row_stride, # stride of each row in mean + Mean_ptr_col_stride, # stride of each column in mean + RSTD_ptr, # pointer to rstd, shape (n_rows, n_groups) + DX_ptr, # pointer to input grad, shape (n_rows, n_groups, hidden_size) + DW_ptr, # pointer to weights grad, shape (n_channels) + DB_ptr, # pointer to bias grad, shape (n_channels) + UPSTREAM_ptr, # pointer to output grad, shape (n_rows, n_channels, hidden_size) + hidden_size: tl.constexpr, # hidden size + channels_per_group: tl.constexpr, # number of channels per group + BLOCK_SIZE: tl.constexpr, + dtype: tl.constexpr, +): + """ + References: + https://nn.labml.ai/normalization/group_norm/index.html + https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md + + The backprop equations are the same for group_norm and layer_norm; the only + difference is that we load the Mean and Rstd corresponding to the group we're + computing gradients for, and those statistics are computed over all channels + in the group, so the total number of elements reduced over is + channels_per_group * hidden_size. + + We also need to load the Weights corresponding to the current channel to compute the gradients.
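+
+    Concretely, writing x_hat = (x - mean) * rstd and wdy = w * dy, the loops
+    below accumulate
+
+        c1 = sum(x_hat * wdy) / N
+        c2 = sum(wdy) / N
+
+    and then compute dx = (wdy - (x_hat * c1 + c2)) * rstd, where
+    N = channels_per_group * hidden_size, i.e. both sums run over every
+    element of the group.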
+ """ + batch_idx = tl.program_id(0) + group_idx = tl.program_id(1) + + # Move the pointers to the correct batch + X_ptr += batch_idx * X_row_stride + DX_ptr += batch_idx * X_row_stride + UPSTREAM_ptr += batch_idx * X_row_stride + + # Mean and rstd are the same shape so have the same strides + mean = tl.load(Mean_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride) + rstd = tl.load(RSTD_ptr + batch_idx * Mean_ptr_row_stride + group_idx * Mean_ptr_col_stride) + + c1 = 0.0 + c2 = 0.0 + block_range = tl.arange(0, BLOCK_SIZE) + + # We need to compute the sum terms of the backprop equations across all channels in the group + for channel_idx in range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group): + dW = 0.0 + dB = 0.0 + # Move the pointers to the correct channel + W = tl.load(W_ptr + channel_idx) + for i in tl.range(0, hidden_size, BLOCK_SIZE): + hidden_size_offsets = i + block_range + mask = hidden_size_offsets < hidden_size + X = tl.load( + X_ptr + channel_idx * X_col_stride + hidden_size_offsets, + mask=mask, + other=0.0, + ) + UPSTREAM_grad = tl.load( + UPSTREAM_ptr + channel_idx * X_col_stride + hidden_size_offsets, + mask=mask, + other=0.0, + ) + + x_hat = (X - mean) * rstd + dW += tl.sum(UPSTREAM_grad * x_hat) + dB += tl.sum(UPSTREAM_grad) + + wdy = W * UPSTREAM_grad + c1 += tl.sum(x_hat * wdy) + c2 += tl.sum(wdy) + + # Need to ensure additions to the same channel are atomic + tl.atomic_add(DW_ptr + channel_idx, dW.to(dtype)) + tl.atomic_add(DB_ptr + channel_idx, dB.to(dtype)) + + N = hidden_size * channels_per_group + c1 = c1 / N + c2 = c2 / N + + for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group): + # Move the pointers to the correct channel + W = tl.load(W_ptr + channel_idx) + for i in range(0, hidden_size, BLOCK_SIZE): + hidden_size_offsets = i + block_range + mask = hidden_size_offsets < hidden_size + X = tl.load( + X_ptr + channel_idx * X_col_stride + hidden_size_offsets, + mask=mask, + other=0.0, + ) + UPSTREAM_grad = tl.load( + UPSTREAM_ptr + channel_idx * X_col_stride + hidden_size_offsets, + mask=mask, + other=0.0, + ) + + x_hat = (X - mean) * rstd + wdy = W * UPSTREAM_grad + dx = (wdy - (x_hat * c1 + c2)) * rstd + tl.store(DX_ptr + channel_idx * X_col_stride + hidden_size_offsets, dx, mask=mask) + + +def group_norm_forward(X, num_channels, num_groups, W, B, eps): + shape = X.shape + batch_size = shape[0] + channels_per_group = num_channels // num_groups + # Reshape X so that the mean and std are computed across the groups + X = X.view(batch_size, num_groups, -1).contiguous() + hidden_size = X.shape[-1] + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size)) + Y = torch.empty((batch_size, num_groups, hidden_size), dtype=X.dtype, device=X.device) + Mean = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device) + RSTD = torch.zeros((batch_size, num_groups), dtype=X.dtype, device=X.device) + + _group_norm_forward_kernel[(batch_size, num_groups)]( + Y, + Y.stride(0), + Y.stride(1), + X, + X.stride(0), + X.stride(1), + Mean, + Mean.stride(0), + Mean.stride(1), + RSTD, + RSTD.stride(0), + RSTD.stride(1), + W, + B, + hidden_size, + channels_per_group, + eps, + BLOCK_SIZE=BLOCK_SIZE, + ) + # Return tensors in the original shape + return Y.view(*shape), X.view(*shape), Mean, RSTD, BLOCK_SIZE + + +def group_norm_backward(dY, X, W, B, Mean, RSTD, num_channels, num_groups): + shape = dY.shape + batch_size = shape[0] + hidden_size = dY.shape[-1] + channels_per_group = 
num_channels // num_groups + dY = dY.view(batch_size, num_groups, -1) + DX = torch.empty( + (batch_size, num_groups, hidden_size * channels_per_group), + dtype=X.dtype, + device=X.device, + ) + DW = torch.zeros((num_channels), dtype=W.dtype, device=W.device) + DB = torch.zeros((num_channels), dtype=B.dtype, device=B.device) + triton_dtype = tl.float32 if X.dtype == torch.float32 else tl.bfloat16 + + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden_size)) + _group_norm_backward_kernel[(batch_size, num_groups)]( + X, + X.stride(0), + X.stride(1), + W, + Mean, + Mean.stride(0), + Mean.stride(1), + RSTD, + DX, + DW, + DB, + dY, + hidden_size, + channels_per_group, + BLOCK_SIZE=BLOCK_SIZE, + dtype=triton_dtype, + ) + + # Return tensors in the original shape + return DX.view(*shape), DW, DB + + +class LigerGroupNormFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward( + ctx, + X, + affine_scaling_weight, + affine_shifting_bias, + num_channels, + num_groups, + eps, + ): + Y, X, Mean, RSTD, BLOCK_SIZE = group_norm_forward( + X, + num_channels, + num_groups, + affine_scaling_weight, + affine_shifting_bias, + eps, + ) + ctx.num_channels = num_channels + ctx.num_groups = num_groups + ctx.save_for_backward(X, affine_scaling_weight, affine_shifting_bias, Mean, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, dY): + X, W, B, Mean, RSTD = ctx.saved_tensors + DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups) + return DX, DW, DB, None, None, None diff --git a/src/liger_kernel/ops/grpo_loss.py b/src/liger_kernel/ops/grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..bcd1262bc84f0966710e381ca9c434a0b4974f42 --- /dev/null +++ b/src/liger_kernel/ops/grpo_loss.py @@ -0,0 +1,930 @@ +import torch +import triton +import triton.language as tl + +# Loss type constants for Triton constexpr branching +# GRPO/DAPO/BNPO/DR_GRPO all use the same per-token loss computation (standard PPO clipping) +_LOSS_TYPE_GRPO: tl.constexpr = tl.constexpr(0) +_LOSS_TYPE_CISPO: tl.constexpr = tl.constexpr(1) +_LOSS_TYPE_SAPO: tl.constexpr = tl.constexpr(2) + +_str_to_loss_type = { + "grpo": _LOSS_TYPE_GRPO.value, + "dapo": _LOSS_TYPE_GRPO.value, + "bnpo": _LOSS_TYPE_GRPO.value, + "dr_grpo": _LOSS_TYPE_GRPO.value, + "luspo": _LOSS_TYPE_GRPO.value, + "cispo": _LOSS_TYPE_CISPO.value, + "sapo": _LOSS_TYPE_SAPO.value, +} + + +@triton.jit +def _selective_log_softmax_kernel( + LOGITS, + INPUT_IDS, + LOG_P, + MASK, + TEMPERATURE, + stride_input_ids_b, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 4096, +): + off_b = tl.program_id(0).cast(tl.int64) + off_l = tl.program_id(1).cast(tl.int64) + + LOGITS += off_b * (L + 1) * N + off_l * N + INPUT_IDS += off_b * stride_input_ids_b + off_l + LOG_P += off_b * L + off_l + + if MASK is not None: + MASK += off_b * stride_input_ids_b + off_l + not_skip = tl.load(MASK) + if not_skip == 0: + return + + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + ids = tl.load(INPUT_IDS) + x = tl.load(LOGITS + ids).to(tl.float32) / TEMPERATURE + logp = x - lse + tl.store(LOG_P, logp) + + +# compue old_logp and ref_logp, it reduce 
10G peak Memory. it does not requires grad +@torch.no_grad +def fused_selective_log_softmax(logits: torch.Tensor, input_ids: torch.Tensor, temperature: float = 0.9, mask=None): + assert logits.is_contiguous() + B, L_ADD_1, N = logits.shape + L = L_ADD_1 - 1 + input_ids = input_ids[:, -L:] + if mask is not None: + mask = mask[:, -L:] + log_p = torch.zeros(B, L, dtype=torch.float32, device=logits.device) + kwargs = {"BLOCK_N": 2048, "num_stages": 4, "num_warps": 1} + _selective_log_softmax_kernel[(B, L)]( + logits, input_ids, log_p, mask, temperature, input_ids.stride(0), L, N, **kwargs + ) + return log_p + + +# @triton.autotune([triton.Config({"BLOCK_N":BLOCK_N}, num_stages=ns, num_warps=nw) +# for BLOCK_N in [2048, 4096, 8192] +# for ns in [1, 2, 4] +# for nw in [1, 2, 4, 8, 16]], +# key=['N']) +@triton.jit +def _grpo_loss_fwd_kernel( + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + COMPLETION_MASK, + ADVANTAGES, + VLLM_IS_RATIO, + VLLM_IS_RATIO_STRIDE, + LOSS, + LSE, + KL, + IS_CLIPPED, + TEMPERATURE, + BETA: tl.constexpr, + EPS_LOW, + EPS_HIGH, + LOSS_TYPE: tl.constexpr, + SAPO_TEMP_POS, + SAPO_TEMP_NEG, + DELTA, + USE_BIAS_CORRECTION_KL: tl.constexpr, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 4096, +): + off_b = tl.program_id(0).cast(tl.int64) + off_l = tl.program_id(1).cast(tl.int64) + + if COMPLETION_MASK is not None: + COMPLETION_MASK += off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK) + if not_skip == 0: + return + + LOGITS += off_b * (L + 1) * N + off_l * N + INPUT_IDS += off_b * L + off_l + ADVANTAGES += off_b + LOSS += off_b * L + off_l + LSE += off_b * L + off_l + IS_CLIPPED += off_b * L + off_l + + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + idx = tl.load(INPUT_IDS) + x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + if OLD_LOGP is None: + old_logp = logp + else: + OLD_LOGP += off_b * L + off_l + old_logp = tl.load(OLD_LOGP).to(tl.float32) + coef_1 = tl.exp(logp - old_logp) + advantage = tl.load(ADVANTAGES).to(tl.float32) + + # Branch based on loss type + if LOSS_TYPE == 0: # GRPO/DAPO/BNPO/DR_GRPO: standard PPO clipping + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0) + is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0) + is_clipped = is_low_clipped | is_high_clipped + # Apply delta (two-sided clipping from INTELLECT-2) to coef_1 + if DELTA != 0.0: + coef_1 = tl.minimum(coef_1, DELTA) + per_token_loss1 = coef_1 * advantage + per_token_loss2 = coef_2 * advantage + per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2) + + elif LOSS_TYPE == 1: # CISPO: upper-bound only clipping, detached, multiply by logp + # Reference: MiniMax-M1 technical report + # https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030 + coef_2 = tl.minimum(coef_1, EPS_HIGH) # upper-bound only (EPS_HIGH is the raw bound for CISPO) + per_token_loss = -coef_2 * advantage * logp # includes logp term + is_clipped = (coef_1 > EPS_HIGH) & (advantage > 0) + + elif LOSS_TYPE == 2: # SAPO: soft adaptive policy optimization with sigmoid gating + # Reference: 
https://huggingface.co/papers/2511.20347 + # Formula: sigmoid(τ * (ρ - 1)) * 4 / τ + temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG) + sigmoid_input = temperature * (coef_1 - 1.0) + sapo_coef = tl.sigmoid(sigmoid_input) * 4.0 / temperature + per_token_loss = -sapo_coef * advantage + is_clipped = 0.0 # SAPO has no clipping concept + + # Apply vLLM importance sampling correction BEFORE adding KL penalty + if VLLM_IS_RATIO is not None: + # Use modulo to support both (B, L) per-token and (B, 1) per-sequence shapes + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + per_token_loss = per_token_loss * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP += off_b * L + off_l + KL += off_b * L + off_l + ref_logp = tl.load(REF_LOGP).to(tl.float32) + kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1 + if USE_BIAS_CORRECTION_KL: + # Importance-sampling-corrected KL (DeepSeek-V3.2): kl *= coef_1 + kl = kl * tl.exp(logp - old_logp) + per_token_loss += BETA * kl + tl.store(KL, kl) + + tl.store(LOSS, per_token_loss) + tl.store(LSE, lse) + tl.store(IS_CLIPPED, is_clipped) + + +# Sequence-level forward kernel: uses pre-computed coef_1 per sequence +@triton.jit +def _grpo_loss_fwd_kernel_seq( + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + COMPLETION_MASK, + ADVANTAGES, + COEF_1, # Pre-computed sequence-level importance weight (B,) + COEF_2, # Pre-computed clipped coef (B,) + IS_CLIPPED_SEQ, # Pre-computed clipping indicator (B,) + VLLM_IS_RATIO, # vLLM importance sampling ratio (B, L) or (B, 1) or None + VLLM_IS_RATIO_STRIDE, # stride for VLLM_IS_RATIO (L for per-token, 1 for per-sequence) + LOSS, + LSE, + KL, + IS_CLIPPED, + TEMPERATURE, + BETA: tl.constexpr, + USE_BIAS_CORRECTION_KL: tl.constexpr, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 4096, +): + off_b = tl.program_id(0).cast(tl.int64) + off_l = tl.program_id(1).cast(tl.int64) + + if COMPLETION_MASK is not None: + COMPLETION_MASK += off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK) + if not_skip == 0: + return + + LOGITS += off_b * (L + 1) * N + off_l * N + INPUT_IDS += off_b * L + off_l + ADVANTAGES += off_b + COEF_1 += off_b + COEF_2 += off_b + IS_CLIPPED_SEQ += off_b + LOSS += off_b * L + off_l + LSE += off_b * L + off_l + IS_CLIPPED += off_b * L + off_l + + # Compute log softmax + m_i = float("-inf") + l_i = 0.0 + for start in range(0, N, BLOCK_N): + cols = start + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE + new_m_i = tl.maximum(m_i, tl.max(logits)) + alpha = tl.exp(m_i - new_m_i) + l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i)) + m_i = new_m_i + lse = m_i + tl.log(l_i) + + idx = tl.load(INPUT_IDS) + x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + + # Load pre-computed sequence-level coefficients + coef_1 = tl.load(COEF_1).to(tl.float32) + coef_2 = tl.load(COEF_2).to(tl.float32) + is_clipped_seq = tl.load(IS_CLIPPED_SEQ) + + advantage = tl.load(ADVANTAGES).to(tl.float32) + per_token_loss1 = coef_1 * advantage + per_token_loss2 = coef_2 * advantage + per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2) + + # Apply vLLM importance sampling correction BEFORE adding KL + if VLLM_IS_RATIO is not None: + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + per_token_loss = per_token_loss * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP += off_b * L + 
off_l + KL += off_b * L + off_l + ref_logp = tl.load(REF_LOGP).to(tl.float32) + kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1 + if USE_BIAS_CORRECTION_KL: + # Importance-sampling-corrected KL (DeepSeek-V3.2): kl *= token-level coef_1 + if OLD_LOGP is None: + old_logp = logp + else: + old_logp = tl.load(OLD_LOGP + off_b * L + off_l).to(tl.float32) + kl = kl * tl.exp(logp - old_logp) + per_token_loss += BETA * kl + tl.store(KL, kl) + + tl.store(LOSS, per_token_loss) + tl.store(LSE, lse) + tl.store(IS_CLIPPED, is_clipped_seq) # Same for all tokens in sequence + + +# Sequence-level backward kernel +@triton.jit +def _grpo_loss_bwd_kernel_seq( + DLOSS, + DLOSS_SUM, + DLOGITS, + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + ADVANTAGES, + COMPLETION_MASK, + LSE, + COEF_1, # Pre-computed sequence-level importance weight (B,) + SEQ_LEN, # Number of valid tokens per sequence (B,) + TEMPERATURE, + BETA: tl.constexpr, + USE_BIAS_CORRECTION_KL: tl.constexpr, + EPS_LOW, + EPS_HIGH, + DELTA, + loss_stride0, + loss_stride1, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 4096, +): + off_b = tl.program_id(0).cast(tl.int64) + off_l = tl.program_id(1).cast(tl.int64) + + DLOGITS += off_b * (L + 1) * N + off_l * N + if COMPLETION_MASK is not None: + COMPLETION_MASK += off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK) + if not_skip == 0: + for start in range(0, N, BLOCK_N): + cols = tl.arange(0, BLOCK_N) + start + tl.store(DLOGITS + cols, 0.0, mask=cols < N) + return + + LOGITS += off_b * (L + 1) * N + off_l * N + DLOSS += off_b * loss_stride0 + off_l * loss_stride1 + DLOSS_SUM += off_b + INPUT_IDS += off_b * L + off_l + ADVANTAGES += off_b + LSE += off_b * L + off_l + COEF_1 += off_b + SEQ_LEN += off_b + + dloss = tl.load(DLOSS).to(tl.float32) + dloss_sum = tl.load(DLOSS_SUM).to(tl.float32) + lse = tl.load(LSE).to(tl.float32) + coef_1 = tl.load(COEF_1).to(tl.float32) + seq_len = tl.load(SEQ_LEN).to(tl.float32) + + idx = tl.load(INPUT_IDS) + x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + + advantage = tl.load(ADVANTAGES).to(tl.float32) + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + if DELTA != 0.0: + coef_1_for_loss = tl.minimum(coef_1, DELTA) + else: + coef_1_for_loss = coef_1 + per_token_loss1 = coef_1_for_loss * advantage + per_token_loss2 = coef_2 * advantage + is_unclipped = per_token_loss2 >= per_token_loss1 + + # For sequence-level: gradient flows through mean, so scale by coef_1/seq_len + # d(loss)/d(logp) = -advantage * coef_1 / seq_len (when unclipped and not delta-clamped) + dlogp = -coef_1 * advantage / seq_len * is_unclipped * dloss_sum + if DELTA != 0.0: + dlogp = dlogp * (coef_1 <= DELTA) + + if BETA != 0.0: + REF_LOGP += off_b * L + off_l + ref_logp = tl.load(REF_LOGP).to(tl.float32) + if USE_BIAS_CORRECTION_KL: + # d(kl * coef_1)/d(logp) = coef_1 * (logp - ref_logp), where coef_1 = exp(logp - old_logp) + if OLD_LOGP is None: + old_logp = logp + else: + old_logp = tl.load(OLD_LOGP + off_b * L + off_l).to(tl.float32) + token_coef_1 = tl.exp(logp - old_logp) + dlogp += BETA * token_coef_1 * (logp - ref_logp) * dloss + else: + dlogp += BETA * (1 - tl.exp(ref_logp - logp)) * dloss + + dlogp = dlogp / TEMPERATURE + tl.debug_barrier() + for start_n in tl.range(0, N, BLOCK_N): + cols = start_n + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE + probs = tl.exp(logits - lse) + dlogits = tl.where(cols == idx, 1 - probs, -probs) * dlogp + tl.store(DLOGITS + cols, 
dlogits, mask=cols < N) + + +@triton.jit +def _grpo_loss_bwd_kernel( + DLOSS, + DLOGITS, + LOGITS, + OLD_LOGP, + REF_LOGP, + INPUT_IDS, + ADVANTAGES, + COMPLETION_MASK, + LSE, + VLLM_IS_RATIO, + VLLM_IS_RATIO_STRIDE, + TEMPERATURE, + BETA: tl.constexpr, + EPS_LOW, + EPS_HIGH, + LOSS_TYPE: tl.constexpr, + SAPO_TEMP_POS, + SAPO_TEMP_NEG, + DELTA, + USE_BIAS_CORRECTION_KL: tl.constexpr, + loss_stride0, + loss_stride1, + L: tl.constexpr, + N: tl.constexpr, + BLOCK_N: tl.constexpr = 4096, +): + off_b = tl.program_id(0).cast(tl.int64) + off_l = tl.program_id(1).cast(tl.int64) + + DLOGITS += off_b * (L + 1) * N + off_l * N + if COMPLETION_MASK is not None: + COMPLETION_MASK += off_b * L + off_l + not_skip = tl.load(COMPLETION_MASK) + if not_skip == 0: + for start in range(0, N, BLOCK_N): + cols = tl.arange(0, BLOCK_N) + start + tl.store(DLOGITS + cols, 0.0, mask=cols < N) + return + + LOGITS += off_b * (L + 1) * N + off_l * N + DLOSS += off_b * loss_stride0 + off_l * loss_stride1 + INPUT_IDS += off_b * L + off_l + ADVANTAGES += off_b + LSE += off_b * L + off_l + + dloss = tl.load(DLOSS).to(tl.float32) + lse = tl.load(LSE).to(tl.float32) + + idx = tl.load(INPUT_IDS) + x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE + logp = x - lse + if OLD_LOGP is None: + old_logp = logp + else: + OLD_LOGP += off_b * L + off_l + old_logp = tl.load(OLD_LOGP).to(tl.float32) + coef_1 = tl.exp(logp - old_logp) + advantage = tl.load(ADVANTAGES).to(tl.float32) + + # Branch based on loss type for gradient computation + if LOSS_TYPE == 0: # GRPO/DAPO/BNPO/DR_GRPO: standard PPO clipping + coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH) + if DELTA != 0.0: + coef_1_for_loss = tl.minimum(coef_1, DELTA) + else: + coef_1_for_loss = coef_1 + per_token_loss1 = coef_1_for_loss * advantage + per_token_loss2 = coef_2 * advantage + mask = per_token_loss2 >= per_token_loss1 + # Gradient uses original coef_1; zero when delta-clamped (constant → no gradient) + dlogp = -coef_1 * advantage * mask + if DELTA != 0.0: + dlogp = dlogp * (coef_1 <= DELTA) + + elif LOSS_TYPE == 1: # CISPO: coef_2 is DETACHED, so gradient only flows through logp + # loss = -coef_2 * advantage * logp, where coef_2 = clamp(coef_1, max=eps_high).detach() + # d(loss)/d(logp) = -coef_2 * advantage (coef_2 treated as constant due to detach) + coef_2 = tl.minimum(coef_1, EPS_HIGH) + dlogp = -coef_2 * advantage + + elif LOSS_TYPE == 2: # SAPO: gradient through sigmoid gating + # loss = -sapo_coef * advantage, where sapo_coef = sigmoid(τ*(ρ-1)) * 4/τ + # d(loss)/d(logp) = -advantage * d(sapo_coef)/d(coef_1) * d(coef_1)/d(logp) + # d(coef_1)/d(logp) = coef_1 (since coef_1 = exp(logp - old_logp)) + # d(sapo_coef)/d(coef_1) = d/d(coef_1)[sigmoid(τ*(coef_1-1)) * 4/τ] + # = τ * sigmoid' * 4/τ = 4 * sigmoid * (1 - sigmoid) + # (the τ factors cancel out in the derivative) + temperature = tl.where(advantage > 0, SAPO_TEMP_POS, SAPO_TEMP_NEG) + sigmoid_input = temperature * (coef_1 - 1.0) + sigmoid_val = tl.sigmoid(sigmoid_input) + d_sapo_d_coef1 = 4.0 * sigmoid_val * (1.0 - sigmoid_val) + dlogp = -advantage * d_sapo_d_coef1 * coef_1 + + # Apply vLLM IS ratio to PPO gradient (before KL gradient) + if VLLM_IS_RATIO is not None: + # Use modulo to support both (B, L) per-token and (B, 1) per-sequence shapes + vllm_is_ratio = tl.load(VLLM_IS_RATIO + off_b * VLLM_IS_RATIO_STRIDE + off_l % VLLM_IS_RATIO_STRIDE).to( + tl.float32 + ) + dlogp = dlogp * vllm_is_ratio + + if BETA != 0.0: + REF_LOGP += off_b * L + off_l + ref_logp = tl.load(REF_LOGP).to(tl.float32) + if 
USE_BIAS_CORRECTION_KL: + # d(kl * coef_1)/d(logp) = coef_1 * (logp - ref_logp), where coef_1 = exp(logp - old_logp) + dlogp += BETA * coef_1 * (logp - ref_logp) + else: + dlogp += BETA * (1 - tl.exp(ref_logp - logp)) + + dlogp = dlogp * dloss / TEMPERATURE + tl.debug_barrier() + for start_n in tl.range(0, N, BLOCK_N): + cols = start_n + tl.arange(0, BLOCK_N) + logits = tl.load(LOGITS + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE + probs = tl.exp(logits - lse) + dlogits = tl.where(cols == idx, 1 - probs, -probs) * dlogp + tl.store(DLOGITS + cols, dlogits, mask=cols < N) + + +def _compute_dapo_normalizer(completion_mask): + """Global active tokens averaged per process (for distributed DAPO loss).""" + normalizer = completion_mask.to(torch.float32).sum() + world_size = 1 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + normalizer = normalizer.clone() + torch.distributed.all_reduce(normalizer, op=torch.distributed.ReduceOp.SUM) + world_size = torch.distributed.get_world_size() + normalizer = normalizer / world_size + return torch.clamp(normalizer, min=1.0) + + +def _reduce_loss(per_token_loss, mask, loss_type, max_completion_length, B, L): + """Apply loss reduction based on loss_type.""" + if loss_type == "grpo" or loss_type == "sapo": + return ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean() + elif loss_type == "bnpo": + return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + max_len = max_completion_length if max_completion_length is not None else L + return (per_token_loss * mask).sum() / (B * max_len) + elif loss_type == "dapo" or loss_type == "cispo": + return (per_token_loss * mask).sum() / _compute_dapo_normalizer(mask) + elif loss_type == "luspo": + return (per_token_loss * mask.sum(-1, keepdim=True)).mean() + raise ValueError(f"Unknown loss_type: {loss_type}. Expected one of: grpo, bnpo, dr_grpo, dapo, cispo, sapo, luspo") + + +class GrpoLossFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type="grpo", + max_completion_length=None, + reduce=True, + importance_sampling_level="token", + sapo_temperature_pos=1.0, + sapo_temperature_neg=1.05, + vllm_is_ratio=None, + delta=None, + use_bias_correction_kl=False, + ): + assert logits.is_contiguous() and completion_ids.is_contiguous() + assert old_logp is None or old_logp.is_contiguous() + assert (ref_logp is not None and ref_logp.is_contiguous()) if beta != 0.0 else True + assert importance_sampling_level in ("token", "sequence"), ( + f"importance_sampling_level must be 'token' or 'sequence', got {importance_sampling_level}" + ) + + # Validate loss_type + if loss_type not in _str_to_loss_type: + raise ValueError(f"Unknown loss_type '{loss_type}'. Supported types: {list(_str_to_loss_type.keys())}") + + # Validate delta + loss_type combinations + if delta is not None and loss_type in ("cispo", "sapo"): + raise ValueError(f"delta (two-sided clipping) is not supported for loss_type='{loss_type}'.") + + # Map delta to float for Triton (Triton can't handle None) + delta_val = 0.0 if delta is None else float(delta) + + # Validate sequence-level + loss_type combinations + if importance_sampling_level == "sequence" and loss_type in ("cispo", "sapo"): + raise ValueError( + f"Sequence-level importance sampling is not supported for loss_type='{loss_type}'. 
" + f"Use importance_sampling_level='token' instead." + ) + + # Validate SAPO temperatures to prevent division by zero or numerical instability + if loss_type == "sapo": + if sapo_temperature_pos <= 0: + raise ValueError(f"sapo_temperature_pos must be positive, got {sapo_temperature_pos}") + if sapo_temperature_neg <= 0: + raise ValueError(f"sapo_temperature_neg must be positive, got {sapo_temperature_neg}") + + # Convert loss_type string to integer for Triton constexpr + loss_type_int = _str_to_loss_type[loss_type] + + B, L_ADD_1, N = logits.shape + L = L_ADD_1 - 1 + + if completion_mask is not None: + assert completion_mask.is_contiguous() + + mask = completion_mask.float() if completion_mask is not None else torch.ones(B, L, device=logits.device) + + # Handle vLLM IS ratio + vllm_is_ratio_ptr = None + vllm_is_ratio_stride = L # default to per-token (unused when ptr is None) + if vllm_is_ratio is not None: + assert vllm_is_ratio.dim() in (1, 2), ( + f"vllm_is_ratio must be 1D (B,) or 2D (B, L) / (B, 1), got {vllm_is_ratio.dim()}D" + ) + if vllm_is_ratio.dim() == 2: + assert vllm_is_ratio.shape[0] == B and vllm_is_ratio.shape[1] in (1, L), ( + f"vllm_is_ratio shape must be ({B}, 1) or ({B}, {L}), got {tuple(vllm_is_ratio.shape)}" + ) + else: + assert vllm_is_ratio.shape[0] == B, ( + f"vllm_is_ratio shape must be ({B},), got {tuple(vllm_is_ratio.shape)}" + ) + vllm_is_ratio = vllm_is_ratio.contiguous() + vllm_is_ratio_ptr = vllm_is_ratio + vllm_is_ratio_stride = vllm_is_ratio.shape[1] if vllm_is_ratio.dim() > 1 else 1 + + # Allocate outputs + loss = torch.zeros(B, L, device=logits.device, dtype=torch.float32) + lse = torch.zeros_like(loss) + is_clipped = torch.zeros_like(loss) + kl = torch.zeros_like(loss) if beta != 0.0 else None + + if importance_sampling_level == "sequence": + # Sequence-level: pre-compute sequence importance weights, then use Triton kernel + # Step 1: Get per-token log probs using existing Triton kernel + per_token_logps = fused_selective_log_softmax(logits, completion_ids, temperature, completion_mask) + + # Step 2: Compute sequence-level importance weights + if old_logp is None: + log_ratio = torch.zeros_like(per_token_logps) + else: + log_ratio = per_token_logps - old_logp + + seq_lens = mask.sum(-1).clamp(min=1.0) # (B,) + seq_log_importance = (log_ratio * mask).sum(-1) / seq_lens # (B,) + coef_1 = torch.exp(seq_log_importance) # (B,) + coef_2 = torch.clamp(coef_1, 1 - eps_low, 1 + eps_high) # (B,) + + # Compute is_clipped at sequence level (using original coef_1) + is_clipped_seq = ((coef_1 < 1 - eps_low) & (advantages < 0)) | ((coef_1 > 1 + eps_high) & (advantages > 0)) + is_clipped_seq = is_clipped_seq.float() # (B,) + + # Apply delta clamp for loss computation (keep original coef_1 for backward) + if delta is not None: + coef_1_for_loss = torch.clamp(coef_1, max=delta) + else: + coef_1_for_loss = coef_1 + + # Step 3: Run Triton kernel with pre-computed coefficients + kwargs = {"BLOCK_N": 2048, "num_stages": 2, "num_warps": 1} + _grpo_loss_fwd_kernel_seq[(B, L)]( + logits, + old_logp, + ref_logp, + completion_ids, + completion_mask, + advantages, + coef_1_for_loss.contiguous(), + coef_2.contiguous(), + is_clipped_seq.contiguous(), + vllm_is_ratio_ptr, + vllm_is_ratio_stride, + loss, + lse, + kl, + is_clipped, + temperature, + beta, + use_bias_correction_kl, + L, + N, + **kwargs, + ) + + # Save extra tensors for backward + ctx.save_for_backward( + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + mask, + coef_1, + 
seq_lens, + vllm_is_ratio_ptr, + ) + else: + # Token-level: use optimized Triton kernel with LOSS_TYPE branching + kwargs = {"BLOCK_N": 2048, "num_stages": 2, "num_warps": 1} + _grpo_loss_fwd_kernel[(B, L)]( + logits, + old_logp, + ref_logp, + completion_ids, + completion_mask, + advantages, + vllm_is_ratio_ptr, + vllm_is_ratio_stride, + loss, + lse, + kl, + is_clipped, + temperature, + beta, + eps_low, + eps_high, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + delta_val, + use_bias_correction_kl, + L, + N, + **kwargs, + ) + ctx.save_for_backward( + logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse, mask, vllm_is_ratio_ptr + ) + + ctx.infos = ( + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + max_completion_length, + B, + L, + importance_sampling_level, + vllm_is_ratio_stride, + reduce, + delta_val, + use_bias_correction_kl, + ) + + # Compute metrics before reduction + mask_sum = mask.sum().clamp(min=1.0) + kl_mean = (kl * mask).sum() / mask_sum if kl is not None else None + clip_ratio = (is_clipped.float() * mask).sum() / mask_sum + + if not reduce: + loss_out = loss * mask + kl_out = kl * mask if kl is not None else None + is_clipped_out = is_clipped * mask + return loss_out, kl_out, is_clipped_out + + reduced_loss = _reduce_loss(loss, mask, loss_type, max_completion_length, B, L) + return reduced_loss, kl_mean, clip_ratio + + @staticmethod + def backward(ctx, *args): + dloss_input = args[0] + saved_tensors = ctx.saved_tensors + ( + temperature, + beta, + eps_low, + eps_high, + inplace, + loss_type, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + max_completion_length, + B, + L, + importance_sampling_level, + vllm_is_ratio_stride, + reduce, + delta_val, + use_bias_correction_kl, + ) = ctx.infos + + if importance_sampling_level == "sequence": + ( + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + mask, + coef_1, + seq_lens, + vllm_is_ratio, + ) = saved_tensors + else: + (logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse, mask, vllm_is_ratio) = ( + saved_tensors + ) + + _, L_ADD_1, N = logits.shape + + # Compute per-token gradient scaling based on loss_type + if not reduce: + dloss = dloss_input + elif loss_type == "grpo" or loss_type == "sapo": + seq_lens_bwd = mask.sum(-1, keepdim=True).clamp(min=1.0) + dloss = dloss_input * mask / (seq_lens_bwd * B) + elif loss_type == "bnpo": + dloss = dloss_input * mask / mask.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + max_len = max_completion_length if max_completion_length is not None else L + dloss = dloss_input * mask / (B * max_len) + elif loss_type == "dapo" or loss_type == "cispo": + dloss = dloss_input * mask / _compute_dapo_normalizer(mask) + elif loss_type == "luspo": + # loss = mean(per_token_loss * seq_lens), mean divides by B*L + seq_lens_bwd = mask.sum(-1, keepdim=True).clamp(min=1.0) + dloss = dloss_input * seq_lens_bwd / (B * L) + else: + raise ValueError(f"Unknown loss_type: {loss_type}") + + dlogits = logits.data if inplace else torch.empty_like(logits) + kwargs = {"BLOCK_N": 4096, "num_stages": 1, "num_warps": 16} + + if importance_sampling_level == "sequence": + if vllm_is_ratio is None: + dloss_sum = dloss.sum(-1).contiguous() + else: + if vllm_is_ratio.dim() == 1: + ratio = vllm_is_ratio.unsqueeze(-1) + else: + ratio = vllm_is_ratio + dloss_sum = (dloss * ratio).sum(-1).contiguous() + # 
Sequence-level backward kernel + _grpo_loss_bwd_kernel_seq[(B, L)]( + dloss, + dloss_sum, + dlogits, + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + coef_1, + seq_lens, + temperature, + beta, + use_bias_correction_kl, + eps_low, + eps_high, + delta_val, + *dloss.stride(), + L, + N, + **kwargs, + ) + else: + # Token-level backward kernel with LOSS_TYPE branching + _grpo_loss_bwd_kernel[(B, L)]( + dloss, + dlogits, + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask, + lse, + vllm_is_ratio, + vllm_is_ratio_stride, + temperature, + beta, + eps_low, + eps_high, + loss_type_int, + sapo_temperature_pos, + sapo_temperature_neg, + delta_val, + use_bias_correction_kl, + *dloss.stride(), + L, + N, + **kwargs, + ) + + dlogits[:, -1, :] = 0 + # Return gradients for all forward inputs: dlogits + 19 None for non-differentiable params + return ( + dlogits, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/jsd.py b/src/liger_kernel/ops/jsd.py new file mode 100755 index 0000000000000000000000000000000000000000..3115a254913b694745a940e73bdd6296c90ff5cd --- /dev/null +++ b/src/liger_kernel/ops/jsd.py @@ -0,0 +1,201 @@ +from typing import Optional + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.utils import infer_device + + +@triton.jit +def _jsd_kernel( + X_ptr, # input in logspace, X = log Q + X_stride, + Y_ptr, # ground truth in logspace, Y = log P + Y_stride, + loss_ptr, + loss_stride, + dX_ptr, + dX_stride, + label_ptr, + beta: tl.constexpr, + n_non_ignore: int, + ignore_index: tl.constexpr, + n_cols, + BLOCK_SIZE: tl.constexpr, + HAS_LABEL: tl.constexpr, +): + # JSD(P || Q) = (KL(P || M) + KL(Q || M)) / 2, M = (1/2) * (P + Q) = (1/2) * (e ^ Y + e ^ X) + # = sum(P * log P + Q * log Q - 2 * M * log M) / 2 + # = sum(e ^ Y * Y + e ^ X * X - 2 * M * log M) / 2 + # grad_x_i = 0.5 * Q * (X - log_M) + pid = tl.program_id(0).to(tl.int64) + X_ptr += pid * X_stride + dX_ptr += pid * dX_stride + Y_ptr += pid * Y_stride + loss_ptr += pid * loss_stride + label_ptr += pid + + if HAS_LABEL: + label = tl.load(label_ptr) + if label == ignore_index: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + tl.store(dX_ptr + offsets, 0.0, mask=offsets < n_cols) + return + + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + X = tl.load(X_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32) + Y = tl.load(Y_ptr + offsets, mask=mask, other=float("-inf")).to(tl.float32) + + if beta == 0.0: # forward KL + Y_max = tl.max(Y, axis=0) + Y_shifted = Y - Y_max + Y_prob = tl.exp(Y_shifted) * tl.exp(Y_max) # Compensate for the shift + loss = Y_prob * (Y - X) + dX = -Y_prob + elif beta == 1.0: # reverse KL + X_max = tl.max(X, axis=0) + X_shifted = X - X_max + X_prob = tl.exp(X_shifted) * tl.exp(X_max) # Compensate for the shift + loss = X_prob * (X - Y) + dX = loss + X_prob + else: + max_val = tl.maximum(tl.max(X, axis=0), tl.max(Y, axis=0)) + X_shifted = X - max_val + Y_shifted = Y - max_val + + # Pre-compute exp(max_val) since it's used twice + exp_max = tl.exp(max_val) + + # Compute exp terms with compensation + Q = tl.exp(X_shifted) * exp_max # = exp(X) + P = tl.exp(Y_shifted) * exp_max # = exp(Y) + + # Pre-compute common terms + beta_P = 
beta * P + one_minus_beta_Q = (1 - beta) * Q + M = beta_P + one_minus_beta_Q + log_M = tl.log(M) # No need to compensate as M is already in original scale + + loss = beta_P * Y + one_minus_beta_Q * X - M * log_M + dX = one_minus_beta_Q * (X - log_M) + + # Pre-compute scaling factor + scale = 1.0 / n_non_ignore + loss = loss * scale + dX = dX * scale + + tl.store(loss_ptr + offsets, loss, mask=mask) + tl.store(dX_ptr + offsets, dX, mask=mask) + + +MAX_FUSED_SIZE = 4096 if infer_device() == "xpu" else 65536 + + +def jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label): + BT, V = _input.shape + n_rows = BT + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + # non reduction loss + loss = torch.zeros(_input.shape, dtype=torch.float32, device=_input.device) + dX = torch.empty_like(_input) + + if has_label: + n_non_ignore = (shift_labels != ignore_index).sum().item() + else: + n_non_ignore = BT + + _jsd_kernel[(n_rows,)]( + X_ptr=_input, # input in logspace, X = log Q + X_stride=_input.stride(-2), + Y_ptr=target, # ground truth in logspace, Y = log P + Y_stride=target.stride(-2), + loss_ptr=loss, + loss_stride=loss.stride(-2), + dX_ptr=dX, + dX_stride=dX.stride(-2), + label_ptr=(shift_labels if has_label else torch.empty(1, device=_input.device)), # dummy ptr if no label + beta=beta, + n_non_ignore=n_non_ignore, + ignore_index=ignore_index, + n_cols=V, + BLOCK_SIZE=BLOCK_SIZE, + HAS_LABEL=has_label, + ) + + loss = torch.sum(loss) + return loss.to(_input.dtype), dX + + +def jsd_backward(dX, grad_output): + # If jsd is the last layer, grad_output is 1.0. Skip the mul to save time + if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)): + return dX + else: + return grad_output * dX + + +class LigerJSDFunction(torch.autograd.Function): + r""" + This class implements the forward and backward pass for the generalized Jensen-Shannon Divergence. + .. math:: + JSD(\beta)(P || Q) + = \beta * KLDiv(P || (\beta * P + (1 - \beta) * Q)) + (1 - \beta) * KLDiv(Q || (\beta * P + (1 - \beta) * Q)) + + .. note:: + As all the other losses in PyTorch, this function expects the first argument, + :attr:`_input`, to be the predictions, the output of the student model, in log-space + and the second, :attr:`target`, to be the observations, the output of the teacher model, in log-space. + This differs from the standard mathematical notation :math:`JSD(P || Q)` where + :math:`P` denotes the teacher model and :math:`Q` denotes the student model. + """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + _input: torch.Tensor, + target: torch.Tensor, + shift_labels: Optional[torch.Tensor] = None, + beta: float = 0.5, + ignore_index: int = -100, + ) -> torch.Tensor: + """ + Args: + _input (torch.Tensor): predict values with shape (BT, V) in logspace + target (torch.Tensor): ground truth values with shape (BT, V) in logspace + shift_labels (Optional[torch.LongTensor]): indicator of next predicted vocab with shape (BT) where each value is in [0, V-1]. + beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5` + ignore_index (int): the index to ignore. Default: -100 + + Returns: + loss (torch.Tensor): generalized JSD + """ + has_label = False + if shift_labels is not None: + assert shift_labels.shape == (_input.shape[0],), ( + f"the shape of shift_labels must be (BT,). 
Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, dX = jsd_forward(_input, target, shift_labels, beta, ignore_index, has_label) + ctx.save_for_backward(dX) + return loss + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + (dX,) = ctx.saved_tensors + dX = jsd_backward(dX, grad_output) + return ( + dX, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/kl_div.py b/src/liger_kernel/ops/kl_div.py new file mode 100755 index 0000000000000000000000000000000000000000..273294072633528d7e07bffa6b74086d85a13ff4 --- /dev/null +++ b/src/liger_kernel/ops/kl_div.py @@ -0,0 +1,259 @@ +from typing import Literal + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import is_hip +from liger_kernel.utils import infer_device + + +def get_num_warps(BLOCK_SIZE): + num_warps = 4 + if BLOCK_SIZE >= 32768: + num_warps = 32 if not is_hip() else 16 + elif BLOCK_SIZE >= 8192: + num_warps = 16 + elif BLOCK_SIZE >= 2048: + num_warps = 8 + + return num_warps + + +if infer_device() == "xpu": + MAX_FUSED_SIZE = 8192 +elif infer_device() == "npu": + MAX_FUSED_SIZE = 8192 +else: + MAX_FUSED_SIZE = 65536 // 4 # 65536 // 4 or 8 works the best + +REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"] + +_REDUCTION_MODE_NONE: tl.constexpr = tl.constexpr(0) +_REDUCTION_MODE_SUM: tl.constexpr = tl.constexpr(1) +_REDUCTION_MODE_MEAN: tl.constexpr = tl.constexpr(2) +_REDUCTION_MODE_BATCHMEAN: tl.constexpr = tl.constexpr(3) + +_str_to_reduction_mode = { + "none": _REDUCTION_MODE_NONE.value, + "sum": _REDUCTION_MODE_SUM.value, + "mean": _REDUCTION_MODE_MEAN.value, + "batchmean": _REDUCTION_MODE_BATCHMEAN.value, +} + + +@triton.jit +def _kldiv_kernel_forward( + y_ptr, # [B, S], prediction ptr, the kernel expects the prediction in log-space + y_stride, # int, prediction stride + gt_ptr, # [B, S], ground truth ptr + gt_stride, # int, ground truth stride + loss_ptr, # [B] or [B, S] if reduction == _REDUCTION_MODE_NONE, output ptr + loss_stride, # int, output stride + n_cols, # int, number of columns in the input tensor + eps, + BLOCK_SIZE: tl.constexpr, + log_target: tl.constexpr = False, + reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN, +): + pid = tl.program_id(0).to(tl.int64) + y_ptr += pid * y_stride + gt_ptr += pid * gt_stride + loss_ptr += pid * loss_stride + + base_offsets = tl.arange(0, BLOCK_SIZE) + + loss_sum = 0.0 + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + base_offsets + mask = offsets < n_cols + y = tl.load(y_ptr + offsets, mask=mask, other=0.0) + y_true = tl.load(gt_ptr + offsets, mask=mask, other=0.0) + + # KL(y_true || y) = y_true * (log(y_true) - log(y)) + # We compute KL(y_true || y) with y in the log-space + if not log_target: + loss = y_true * (tl.log(tl.maximum(y_true, eps)) - y) + else: + loss = tl.exp(y_true) * (y_true - y) + + if reduction == _REDUCTION_MODE_NONE: + tl.store(loss_ptr + offsets, loss, mask=mask) + else: + loss_sum += tl.sum(loss, axis=0) + + if reduction != _REDUCTION_MODE_NONE: + tl.store(loss_ptr, loss_sum) + + +@triton.jit +def _kldiv_kernel_backward( + target_ptr, + target_stride, + new_grads_ptr, + new_grads_stride, + n_cols, + BLOCK_SIZE: tl.constexpr, + log_target: tl.constexpr = False, +): + pid = tl.program_id(0).to(tl.int64) + + target_ptr += pid * target_stride + new_grads_ptr += pid * new_grads_stride + + offsets = tl.arange(0, BLOCK_SIZE) + mask 
= offsets < n_cols + + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_cols + + target = tl.load(target_ptr + offsets, mask=mask, other=0.0) + + if not log_target: + res = target * -1 + else: + res = -tl.exp(target) + + tl.store(new_grads_ptr + offsets, res, mask=mask) + + +def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps): # [BT, V] + BT, V = y_pred.shape + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE) + + grid = (BT,) + reduction = _str_to_reduction_mode[reduction] + + out_size = (BT, V) if reduction == _REDUCTION_MODE_NONE.value else (BT,) + output_tensor = torch.zeros(out_size, device=y_pred.device, dtype=torch.float32) + + _kldiv_kernel_forward[grid]( + y_pred, + y_pred.stride(0), + y_true, + y_true.stride(0), + output_tensor, + output_tensor.stride(0), + V, + eps=eps, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + log_target=log_target, + reduction=reduction, + ) + + # Reduce according to the reduction mode, matching PyTorch. In later PyTorch versions, `mean` is slated to behave the same as `batchmean` + # https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html + # https://github.com/pytorch/pytorch/blob/d7b57c4d63edb42e1deeeba9497fcb5f1f748ff2/torch/nn/functional.py#L3372 + if reduction == _REDUCTION_MODE_BATCHMEAN.value: + return output_tensor.sum() / BT + elif reduction == _REDUCTION_MODE_SUM.value: + return output_tensor.sum(dim=0) + elif reduction == _REDUCTION_MODE_MEAN.value: + return output_tensor.sum() / (BT * V) + else: + return output_tensor + + +def kldiv_backward_triton(target, grad_output, new_grads, log_target): + BT, V = target.shape + BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE) + + grid = (BT,) + + # The gradients are written into the caller-provided `new_grads` buffer + _kldiv_kernel_backward[grid]( + target, + target.stride(0), + new_grads, + new_grads.stride(0), + V, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + log_target=log_target, + ) + + # If the KL divergence loss is the last layer, grad_output is 1.0. Skip the mul then. + if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)): + return new_grads + + return new_grads * grad_output + + +class LigerKLDivLossFunction(torch.autograd.Function): + """ + Class implementing the forward and backward pass for the KL Divergence Loss using Triton, as defined by the following formula: + ```python + if log_target: + loss = target.exp() * (target - input) + else: + loss = target * (target.log() - input) + ``` + The loss is then reduced according to the `reduction` parameter, as defined in the PyTorch documentation: https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html + """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + y_pred: torch.Tensor, + y_true: torch.Tensor, + reduction: REDUCTION_LITERAL = "batchmean", + log_target: bool = False, + eps: float = 1e-10, + ) -> torch.Tensor: + """A forward pass for the KL Divergence Loss. + + Args: + ctx: Torch autograd context + y_pred (torch.Tensor): A tensor of shape (BT, V) containing the predicted values, expected to be log-probabilities. + y_true (torch.Tensor): A tensor of shape (BT, V) containing the target values, expected to be either probabilities or log-probabilities, depending on the value of `log_target`. + reduction (REDUCTION_LITERAL, optional): Reduction to be used.
Defaults to "batchmean". + log_target (bool, optional): If set to true, expects the ground truth to already be log-probabilities. Defaults to False. + eps: (float, optional): A small value to avoid division by zero. Defaults to 1e-10. + + Returns: + torch.Tensor: The computed KL Divergence Loss, with shape (BT, V) if `reduction` is "none", else a scalar. + """ + ctx.save_for_backward(y_true) + ctx.reduction = reduction + ctx.log_target = log_target + return kldiv_forward_triton(y_pred, y_true, log_target=log_target, reduction=reduction, eps=eps) + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + """A backward pass for the KL Divergence Loss. + + Args: + ctx: Torch autograd context + grad_output (torch.Tensor): The gradient of the loss with respect to the output. + + Returns: + tuple[torch.Tensor, None, None, None, None]: The gradient of the loss with respect to the inputs and None for the other arguments of the forward method. + """ + (y_true,) = ctx.saved_tensors + + new_grads = torch.empty_like(y_true) + + derivative = kldiv_backward_triton(y_true, grad_output, new_grads, ctx.log_target) + + if ctx.reduction == "batchmean": + derivative = derivative / y_true.shape[0] + elif ctx.reduction == "sum" or ctx.reduction == "none": + pass + elif ctx.reduction == "mean": + derivative = derivative / (y_true.shape[0] * y_true.shape[1]) + + return ( + derivative, + None, + None, + None, + None, + ) diff --git a/src/liger_kernel/ops/layer_norm.py b/src/liger_kernel/ops/layer_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..e8ac6b5f39b8ed227b68f75cc79f72fc4012b5c2 --- /dev/null +++ b/src/liger_kernel/ops/layer_norm.py @@ -0,0 +1,320 @@ +import math +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import set_large_grf_mode +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import rsqrt + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import rsqrt +else: + from triton.language.math import rsqrt + + +@triton.jit +def _layer_norm_forward_kernel( + Y_ptr, # pointer to output, shape (n_rows, n_cols) + Y_row_stride, # stride of each row in output + X_ptr, # pointer to input, shape (n_rows, n_cols) + X_row_stride, # stride of each row in input + W_ptr, # pointer to weights, shape (n_cols,) + W_row_stride, # stride of each row in weights + B_ptr, # pointer to bias, shape (n_cols,) + B_row_stride, # stride of each row in bias + Mean_ptr, # pointer to mean, shape (n_rows,) + Mean_row_stride, # stride of each row in mean + RSTD_ptr, # pointer to rstd, shape (n_rows,) + RSTD_row_stride, # stride of each row in rstd + n_cols, + eps, + BLOCK_SIZE: tl.constexpr, +): + """ + References: + https://arxiv.org/abs/1607.06450 + https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md + """ + row_idx = tl.program_id(0).to(tl.int64) + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + # Pre-load weights and bias in fp32 to avoid repeated conversions + W_row = tl.load(W_ptr + col_offsets, mask=mask, 
other=0.0) + B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0.0) + W_f32 = W_row.to(tl.float32) + B_f32 = B_row.to(tl.float32) + + # Calculate pointers for this row + row_X_ptr = X_ptr + row_idx * X_row_stride + row_Y_ptr = Y_ptr + row_idx * Y_row_stride + row_Mean_ptr = Mean_ptr + row_idx * Mean_row_stride + row_RSTD_ptr = RSTD_ptr + row_idx * RSTD_row_stride + + # Load input data and convert to fp32 for numerical stability + X_row = tl.load(row_X_ptr + col_offsets, mask=mask, other=0.0) + X_f32 = X_row.to(tl.float32) + + # Compute statistics in fp32 for numerical stability + mean = tl.sum(X_f32, axis=0) / n_cols + X_centered = X_f32 - mean + # Apply mask to variance calculation to exclude contributions from masked elements + X_centered_masked = tl.where(mask, X_centered, 0.0) + var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols + rstd = rsqrt(var + eps) + + # Store statistics (convert back to original dtype only once) + tl.store(row_Mean_ptr, mean.to(X_row.dtype)) + tl.store(row_RSTD_ptr, rstd.to(X_row.dtype)) + + # Fused normalization and affine transformation + # Y = (X - mean) * rstd * W + B = X_centered * rstd * W + B + Y_f32 = X_centered * rstd * W_f32 + B_f32 + + # Store output (single conversion back to original dtype) + tl.store(row_Y_ptr + col_offsets, Y_f32.to(X_row.dtype), mask=mask) + + +@triton.jit +def _layer_norm_backward_kernel( + X_ptr, # pointer to input, shape (n_rows, n_cols) + stride_x, # stride of each row in input + W_ptr, # pointer to weights, shape (n_cols,) + Mean_ptr, # pointer to mean, shape (n_rows,) + stride_mean, # stride of each row in mean + RSTD_ptr, # pointer to rstd, shape (n_rows,) + stride_rstd, # stride of each row in rstd + DX_ptr, # pointer to input grad, shape (n_rows, n_cols) + stride_dx, # stride of each row in input grad + DW_ptr, # pointer to weights grad, shape (n_cols,) + stride_dw, # stride of each row in weights grad + DB_ptr, # pointer to bias grad, shape (n_cols,) + stride_db, # stride of each row in bias grad + DY_ptr, # pointer to output grad, shape (n_rows, n_cols) + stride_dy, # stride of each row in output grad + n_rows, + n_cols, + rows_per_program: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + References: + https://arxiv.org/abs/1607.06450 + https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md + """ + row_block_id = tl.program_id(0).to(tl.int64) + row_start = row_block_id * rows_per_program + row_end = min((row_block_id + 1) * rows_per_program, n_rows) + cols = tl.arange(0, BLOCK_SIZE) + mask = cols < n_cols + + dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32) + + # Pre-load weights once (same optimization as forward pass) + w = tl.load(W_ptr + cols, mask=mask, other=0.0) + w_f32 = w.to(tl.float32) + + for row_idx in range(row_start, row_end): + # Calculate pointers for this specific row + row_X_ptr = X_ptr + row_idx * stride_x + row_DX_ptr = DX_ptr + row_idx * stride_dx + row_DY_ptr = DY_ptr + row_idx * stride_dy + row_Mean_ptr = Mean_ptr + row_idx * stride_mean + row_RSTD_ptr = RSTD_ptr + row_idx * stride_rstd + + # Load data for this row + x = tl.load(row_X_ptr + cols, mask=mask, other=0.0) + dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0) + mean = tl.load(row_Mean_ptr) + rstd = tl.load(row_RSTD_ptr) + + # Convert to fp32 for numerical stability + x_f32 = x.to(tl.float32) + dy_f32 = dy.to(tl.float32) + mean_f32 = mean.to(tl.float32) + rstd_f32 = rstd.to(tl.float32) + + # Compute backward pass for this row + x_hat = 
(x_f32 - mean_f32) * rstd_f32
+        wdy = w_f32 * dy_f32
+        c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
+        c2 = tl.sum(wdy, axis=0) / n_cols
+        dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
+
+        # Store input gradient
+        tl.store(row_DX_ptr + cols, dx, mask=mask)
+
+        # Accumulate weight and bias gradients for this thread block's assigned rows
+        dw = dy_f32 * x_hat
+        db = dy_f32
+        dW_row += dw
+        db_row += db
+
+    tl.store(DW_ptr + row_block_id * stride_dw + cols, dW_row, mask=mask)
+    tl.store(DB_ptr + row_block_id * stride_db + cols, db_row, mask=mask)
+
+
+def layer_norm_forward(X, W, B, eps):
+    """
+    Args:
+        X: Input tensor of shape (..., hidden_size)
+        W: Weight tensor of shape (hidden_size,)
+        B: Bias tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+
+    Returns:
+        Tuple of (output, input, mean, rstd, block_size, num_warps)
+    """
+    shape = X.shape
+    dim = shape[-1]
+    X = X.view(-1, dim)
+    n_rows, n_cols = X.shape
+
+    # Calculate optimal block size and warp configuration
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+    # Allocate output tensors
+    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device)
+    RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device)
+
+    # Validate input dimensions
+    if X.shape[1] != W.shape[0]:
+        raise ValueError(
+            f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) "
+            f"must match weight size (W.shape[0]={W.shape[0]})"
+        )
+
+    # XPU-specific optimization
+    kernel_args = {}
+    if X.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+
+    # Launch kernel with one thread block per row for optimal performance
+    grid = (n_rows,)
+    _layer_norm_forward_kernel[grid](
+        Y,
+        Y.stride(0),
+        X,
+        X.stride(0),
+        W,
+        W.stride(0),
+        B,
+        B.stride(0),
+        Mean,
+        Mean.stride(0),
+        RSTD,
+        RSTD.stride(0),
+        n_cols,
+        eps,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+        **kernel_args,
+    )
+
+    return Y.view(*shape), X, Mean, RSTD, BLOCK_SIZE, num_warps
+
+
+def layer_norm_backward(dY, X, W, B, Mean, RSTD):
+    """
+    Args:
+        dY: Gradient of output
+        X: Input tensor
+        W: Weight tensor
+        B: Bias tensor
+        Mean: Pre-computed mean
+        RSTD: Pre-computed reciprocal standard deviation
+
+    Returns:
+        Tuple of (input_grad, weight_grad, bias_grad)
+    """
+    shape = dY.shape
+    dim = shape[-1]
+    dY = dY.view(-1, dim)
+    n_rows, n_cols = dY.shape
+
+    sm_count = 1
+    if X.device.type == "cuda":
+        sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
+    elif X.device.type == "xpu":
+        sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_core_count()
+
+    # Accumulate partial weight/bias gradients in fp32 for numerical stability;
+    # the per-program partials are summed (and cast back) after the kernel.
+    _DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+    _DB = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+
+    # Calculate optimal block size and warp configuration
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+    if n_cols > BLOCK_SIZE:
+        raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
+    rows_per_program = math.ceil(n_rows / sm_count)
+    grid = (sm_count,)
+
+    # Allocate gradient tensors
+    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+
+    kernel_args = {"num_warps": num_warps}
+    # XPU-specific optimization
+    if X.device.type == "xpu":
+        kernel_args.update({"num_warps": 32, "num_stages": 4})
+        set_large_grf_mode(kernel_args)
+
+    # Launch one program per multiprocessor; each program accumulates gradients
+    # for its `rows_per_program` assigned rows
+    _layer_norm_backward_kernel[grid](
+        X,
+        X.stride(0),
+        W,
+        Mean,
+        Mean.stride(0),
+        RSTD,
+        RSTD.stride(0),
+        DX,
+        DX.stride(0),
+        _DW,
+        _DW.stride(0),
+        _DB,
+        _DB.stride(0),
+        dY,
+        dY.stride(0),
+        n_rows,
+        n_cols,
+        rows_per_program=rows_per_program,
+        BLOCK_SIZE=BLOCK_SIZE,
+        **kernel_args,
+    )
+
+    DX = DX.view(*shape)
+    DW = _DW.sum(dim=0).to(W.dtype)
+    DB = _DB.sum(dim=0).to(B.dtype)
+
+    return DX, DW, DB
+
+
+class LigerLayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    @ensure_contiguous
+    def forward(ctx, X, W, B, eps):
+        Y, X, Mean, RSTD, BLOCK_SIZE, num_warps = layer_norm_forward(X, W, B, eps)
+        ctx.save_for_backward(X, W, B, Mean, RSTD)
+        return Y
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dY):
+        X, W, B, Mean, RSTD = ctx.saved_tensors
+        DX, DW, DB = layer_norm_backward(dY, X, W, B, Mean, RSTD)
+        return DX, DW, DB, None
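+
+
+# A minimal usage sketch (illustrative only; a CUDA device is assumed and the
+# hidden size must fit within the maximum supported block size):
+#
+#     x = torch.randn(8, 128, 1024, device="cuda", requires_grad=True)
+#     w = torch.ones(1024, device="cuda", requires_grad=True)
+#     b = torch.zeros(1024, device="cuda", requires_grad=True)
+#     y = LigerLayerNormFunction.apply(x, w, b, 1e-6)
+#     y.sum().backward()  # populates x.grad, w.grad, and b.grad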
diff --git a/src/liger_kernel/ops/llama4_rope.py b/src/liger_kernel/ops/llama4_rope.py
new file mode 100755
index 0000000000000000000000000000000000000000..9167d69f35d76998367a7a5bb095c130aa8c39f1
--- /dev/null
+++ b/src/liger_kernel/ops/llama4_rope.py
@@ -0,0 +1,180 @@
+import torch
+import triton
+import triton.language as tl
+
+
+def _cast_and_contiguous(q, k, freqs_complex):
+    # Compute in q's dtype: fp32 inputs stay fp32, and half-precision inputs keep
+    # their dtype for performance; k is aligned to q's dtype.
+    compute_dtype = q.dtype
+
+    if k.dtype != q.dtype:
+        k = k.to(q.dtype)
+
+    q = q.to(compute_dtype).contiguous()
+    k = k.to(compute_dtype).contiguous()
+    freqs_complex = freqs_complex.contiguous()
+    return q, k, freqs_complex
+
+
+@triton.jit
+def _llama4_rope_kernel(
+    q_ptr,
+    k_ptr,
+    freqs_complex_ptr,
+    q_row_stride,
+    k_row_stride,
+    q_head_stride,
+    k_head_stride,
+    freqs_row_stride,
+    seq_len,
+    batch_size,
+    imag_sign,
+    head_dim_half: tl.constexpr,
+    n_q_heads: tl.constexpr,
+    n_k_heads: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    H100-optimized RoPE kernel with improved parallelization across heads and dimensions.
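+    The last dimension of q/k is treated as interleaved (real, imag) pairs, and
+    freqs_complex is expected to be the [seq, head_dim_half, 2] real view that
+    the host wrapper produces via torch.view_as_real.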
+ Grid: (batch*seq, head) + """ + # 2D grid + pid_bs = tl.program_id(0) # over batch*seq + pid_h = tl.program_id(1) # over heads + + batch_idx = pid_bs // seq_len + seq_idx = pid_bs % seq_len + + # Bounds check + if batch_idx >= batch_size or seq_idx >= seq_len: + return + + # Base pointers for this (batch, seq) position + base_offset = batch_idx * seq_len + seq_idx + q_base = q_ptr + base_offset * q_row_stride + k_base = k_ptr + base_offset * k_row_stride + freq_base = seq_idx * freqs_row_stride + + # Tiling over dim/2 + for d_start in tl.static_range(0, head_dim_half, BLOCK_SIZE): + d_indices = d_start + tl.arange(0, BLOCK_SIZE) + mask_d = d_indices < head_dim_half + + # Compute offsets for the block + freq_offsets = d_indices[:, None] * 2 + tl.arange(0, 2)[None, :] + # Load the block + freqs_complex = tl.load(freqs_complex_ptr + freq_base + freq_offsets, mask=mask_d[:, None], other=0.0) + freqs_real, freqs_imag = tl.split(freqs_complex) + freqs_imag = freqs_imag * imag_sign + + # Process one query head per program in pid_h + if pid_h < n_q_heads: + q_head_ptr = q_base + pid_h * q_head_stride + q_real = tl.load(q_head_ptr + d_indices * 2, mask=mask_d, other=0.0) + q_imag = tl.load(q_head_ptr + d_indices * 2 + 1, mask=mask_d, other=0.0) + + # Complex multiply with FMAs: (a+ib)*(c+i d) = (a*c - b*d) + i(a*d + b*c) + new_q_real = tl.math.fma(q_real, freqs_real, -(q_imag * freqs_imag)) + new_q_imag = tl.math.fma(q_real, freqs_imag, q_imag * freqs_real) + + tl.store(q_head_ptr + d_indices * 2, new_q_real, mask=mask_d) + tl.store(q_head_ptr + d_indices * 2 + 1, new_q_imag, mask=mask_d) + + # Process one key head per program in pid_h + if pid_h < n_k_heads: + k_head_ptr = k_base + pid_h * k_head_stride + k_real = tl.load(k_head_ptr + d_indices * 2, mask=mask_d, other=0.0) + k_imag = tl.load(k_head_ptr + d_indices * 2 + 1, mask=mask_d, other=0.0) + + new_k_real = tl.math.fma(k_real, freqs_real, -(k_imag * freqs_imag)) + new_k_imag = tl.math.fma(k_real, freqs_imag, k_imag * freqs_real) + + tl.store(k_head_ptr + d_indices * 2, new_k_real, mask=mask_d) + tl.store(k_head_ptr + d_indices * 2 + 1, new_k_imag, mask=mask_d) + + +def _select_kernel_meta(head_dim_half: int): + # Heuristic tuning for block size and num_warps + if head_dim_half >= 256: + return 128, 8 + if head_dim_half >= 96: + return 128, 4 + if head_dim_half >= 48: + return 64, 4 + if head_dim_half >= 24: + return 32, 2 + return 16, 2 + + +def llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE: int = None, imag_sign: float = 1.0): + # Save original dtype for casting back + original_dtype = q.dtype + + batch_size, seq_len, n_q_heads, head_dim = q.shape + _, _, n_k_heads, _ = k.shape + head_dim_half = head_dim // 2 + if freqs_cis.is_complex(): + freqs_cis = freqs_cis.reshape(-1, freqs_cis.shape[-1]) + if freqs_cis.shape[0] > seq_len: + freqs_cis = freqs_cis[:seq_len] + freqs_cis = torch.view_as_real(freqs_cis) + + # Cast to appropriate dtype and make contiguous only when needed + q, k, freqs_cis = _cast_and_contiguous(q, k, freqs_cis) + + # H100-optimized meta-params + if BLOCK_SIZE is None: + BLOCK_SIZE, num_warps = _select_kernel_meta(head_dim_half) + else: + # Provide a default num_warps if caller pins BLOCK_SIZE + _, num_warps = _select_kernel_meta(head_dim_half) + + # 2D grid: one program per (batch, seq, head) + n_heads_max = max(n_q_heads, n_k_heads) + grid = (batch_size * seq_len, n_heads_max) + + # Launch kernel + _llama4_rope_kernel[grid]( + q, + k, + freqs_cis, + q.stride(1), + k.stride(1), + q.stride(2), + k.stride(2), + 
freqs_cis.stride(0), + seq_len, + batch_size, + imag_sign, + head_dim_half, + n_q_heads, + n_k_heads, + BLOCK_SIZE, + num_warps=num_warps, + num_stages=2, + ) + + # Cast back to original dtype only if it differs from compute dtype + if q.dtype != original_dtype: + q = q.to(original_dtype) + if k.dtype != original_dtype: + k = k.to(original_dtype) + + return q, k + + +class LigerLlama4RopeFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, freqs_cis, BLOCK_SIZE: int = None): + q_out, k_out = llama4_rope_forward(q, k, freqs_cis, BLOCK_SIZE, imag_sign=1.0) + ctx.save_for_backward(freqs_cis.detach() if isinstance(freqs_cis, torch.Tensor) else freqs_cis) + ctx.BLOCK_SIZE = BLOCK_SIZE + return q_out, k_out + + @staticmethod + def backward(ctx, dq, dk): + (freqs_cis,) = ctx.saved_tensors + BLOCK_SIZE = getattr(ctx, "BLOCK_SIZE", None) + # Use imag_sign=-1.0 for conjugate without materializing a new tensor + dq_out, dk_out = llama4_rope_forward(dq, dk, freqs_cis, BLOCK_SIZE, imag_sign=-1.0) + return dq_out, dk_out, None diff --git a/src/liger_kernel/ops/mhc.py b/src/liger_kernel/ops/mhc.py new file mode 100755 index 0000000000000000000000000000000000000000..1a4569d334137a3102070e24543655df361cd6b6 --- /dev/null +++ b/src/liger_kernel/ops/mhc.py @@ -0,0 +1,1674 @@ +import math + +from typing import Any +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous + + +def _post_res_default_meta(c: int) -> Tuple[int, int, int, int]: + """ + Returns default (block_n, block_c, num_warps, num_stages) for post_res kernels. + Tuned for different hidden dimensions on NVIDIA GPUs. + """ + if c >= 4096: + return 32, 128, 8, 3 # (block_n, block_c, num_warps, num_stages) + if c >= 2048: + return 32, 128, 4, 2 + if c >= 1024: + return 32, 64, 4, 2 + return 32, 64, 2, 2 + + +def _post_res_meta( + c: int, + block_n: Optional[int], + block_c: Optional[int], + num_warps: Optional[int], + num_stages: Optional[int], +) -> Tuple[int, int, int, int]: + bn, bc, nw, ns = _post_res_default_meta(c) + return ( + bn if block_n is None else int(block_n), + bc if block_c is None else int(block_c), + nw if num_warps is None else int(num_warps), + ns if num_stages is None else int(num_stages), + ) + + +# ------------------------------------------------------------------------------------------------- +# (1) Coefficients: fused matmul + RMS scalar (Eq. 
14–15) +# mix = (x @ phi) * rsqrt(mean(x^2) + eps) +# +# We provide two paths: +# - TC path: x BF16/FP16 and phi BF16/FP16 (Tensor Cores) +# - TF32-ish path: x cast to FP32 and phi FP32 (relies on Triton/arch for TF32) +# ------------------------------------------------------------------------------------------------- + + +@triton.jit +def _mhc_mm_norm_fwd_kernel( + x_ptr, + phi_ptr, + mix_ptr, + invr_ptr, + N: tl.constexpr, + K: tl.constexpr, + M: tl.constexpr, + stride_xn: tl.constexpr, + stride_xk: tl.constexpr, + stride_phik: tl.constexpr, + stride_phim: tl.constexpr, + stride_mn: tl.constexpr, + stride_mm: tl.constexpr, + eps: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + BLOCK_M: tl.constexpr, + CAST_FP32: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_m = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + m_offs = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + + acc = tl.zeros((BLOCK_N, BLOCK_M), tl.float32) + sumsq = tl.zeros((BLOCK_N,), tl.float32) + + for k0 in tl.static_range(0, K, BLOCK_K): + k_offs = k0 + tl.arange(0, BLOCK_K) + + x = tl.load( + x_ptr + n_offs[:, None] * stride_xn + k_offs[None, :] * stride_xk, + mask=(n_offs[:, None] < N) & (k_offs[None, :] < K), + other=0.0, + ) + if CAST_FP32: + x = x.to(tl.float32) + sumsq += tl.sum(x * x, axis=1) + else: + x_f = x.to(tl.float32) + sumsq += tl.sum(x_f * x_f, axis=1) + + phi = tl.load( + phi_ptr + k_offs[:, None] * stride_phik + m_offs[None, :] * stride_phim, + mask=(k_offs[:, None] < K) & (m_offs[None, :] < M), + other=0.0, + ) + if CAST_FP32: + phi = phi.to(tl.float32) + + acc += tl.dot(x, phi) + + invr = tl.rsqrt(sumsq / K + eps) + out = acc * invr[:, None] + + tl.store( + mix_ptr + n_offs[:, None] * stride_mn + m_offs[None, :] * stride_mm, + out, + mask=(n_offs[:, None] < N) & (m_offs[None, :] < M), + ) + if pid_m == 0: + tl.store(invr_ptr + n_offs, invr, mask=n_offs < N) + + +def mhc_mm_norm_fwd( + x: torch.Tensor, + phi: torch.Tensor, + eps: float, + *, + out_mix: Optional[torch.Tensor] = None, + out_invr: Optional[torch.Tensor] = None, + block_n: int = 32, + block_k: int = 256, + block_m: int = 32, + num_warps: int = 4, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Fused (x @ phi) + invr = rsqrt(mean(x^2)+eps) and returns mix=(x@phi)*invr. 
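+
+    Example (illustrative sketch; the shapes are arbitrary and a Triton-capable
+    GPU is assumed):
+        >>> x = torch.randn(64, 4096, device="cuda", dtype=torch.bfloat16)
+        >>> phi = torch.randn(4096, 24, device="cuda", dtype=torch.bfloat16)
+        >>> mix, invr = mhc_mm_norm_fwd(x, phi, eps=1e-6)
+        >>> mix.shape, invr.shape
+        (torch.Size([64, 24]), torch.Size([64]))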
+ + Args: + x: [N, K] contiguous + phi: [K, M] contiguous + eps: float + Returns: + mix: [N, M] float32 + invr: [N] float32 + """ + assert x.is_contiguous(), "x must be contiguous" + assert phi.is_contiguous(), "phi must be contiguous" + + N, K = x.shape + K2, M = phi.shape + assert K2 == K, f"phi.shape[0] must match K: got {K2} vs {K}" + + if out_mix is None: + out_mix = torch.empty((N, M), device=x.device, dtype=torch.float32) + if out_invr is None: + out_invr = torch.empty((N,), device=x.device, dtype=torch.float32) + + grid = (triton.cdiv(N, block_n), triton.cdiv(M, block_m)) + + use_tc = (x.dtype == phi.dtype) and (x.dtype in (torch.float16, torch.bfloat16)) + + _mhc_mm_norm_fwd_kernel[grid]( + x, + phi, + out_mix, + out_invr, + N=N, + K=K, + M=M, + stride_xn=x.stride(0), + stride_xk=x.stride(1), + stride_phik=phi.stride(0), + stride_phim=phi.stride(1), + stride_mn=out_mix.stride(0), + stride_mm=out_mix.stride(1), + eps=eps, + BLOCK_N=block_n, + BLOCK_K=block_k, + BLOCK_M=block_m, + CAST_FP32=not use_tc, + num_warps=num_warps, + ) + return out_mix, out_invr + + +# ------------------------------------------------------------------------------------------------- +# Backward for fused (x @ phi) + RMS scalar +# +# mix = (x @ phi) * invr +# invr = rsqrt(mean(x^2) + eps) +# +# Given grad_mix, compute: +# grad_z = grad_mix * invr +# g = sum(grad_mix * (mix / invr)) = sum(grad_mix * mix) / invr +# factor = -(g / K) * invr^3 +# grad_x = grad_z @ phi^T + factor * x +# grad_phi = x^T @ grad_z +# +# grad_phi is accumulated into FP32 with atomic adds (split over N-chunks). +# ------------------------------------------------------------------------------------------------- + + +@triton.jit +def _mhc_mm_norm_bwd_fused_kernel( + x_ptr, + phi_ptr, + mix_ptr, + invr_ptr, + grad_mix_ptr, + grad_x_ptr, + grad_phi_ptr, + N: tl.constexpr, + K: tl.constexpr, + M: tl.constexpr, + stride_xn: tl.constexpr, + stride_xk: tl.constexpr, + stride_phik: tl.constexpr, + stride_phim: tl.constexpr, + stride_mn: tl.constexpr, + stride_mm: tl.constexpr, + stride_invr: tl.constexpr, + stride_gmn: tl.constexpr, + stride_gmm: tl.constexpr, + stride_gxn: tl.constexpr, + stride_gxk: tl.constexpr, + stride_gpk: tl.constexpr, + stride_gpm: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + BLOCK_M: tl.constexpr, + CAST_FP32: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_k = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + k_offs = pid_k * BLOCK_K + tl.arange(0, BLOCK_K) + + invr = tl.load(invr_ptr + n_offs * stride_invr, mask=n_offs < N, other=0.0).to(tl.float32) + + x = tl.load( + x_ptr + n_offs[:, None] * stride_xn + k_offs[None, :] * stride_xk, + mask=(n_offs[:, None] < N) & (k_offs[None, :] < K), + other=0.0, + ) + if CAST_FP32: + x = x.to(tl.float32) + x_f = x + else: + x_f = x.to(tl.float32) + + acc = tl.zeros((BLOCK_N, BLOCK_K), tl.float32) + g_acc = tl.zeros((BLOCK_N,), tl.float32) + + for m0 in tl.static_range(0, M, BLOCK_M): + m_offs = m0 + tl.arange(0, BLOCK_M) + + grad_mix = tl.load( + grad_mix_ptr + n_offs[:, None] * stride_gmn + m_offs[None, :] * stride_gmm, + mask=(n_offs[:, None] < N) & (m_offs[None, :] < M), + other=0.0, + ).to(tl.float32) + + mix = tl.load( + mix_ptr + n_offs[:, None] * stride_mn + m_offs[None, :] * stride_mm, + mask=(n_offs[:, None] < N) & (m_offs[None, :] < M), + other=0.0, + ).to(tl.float32) + + g_acc += tl.sum(grad_mix * mix, axis=1) + + phi = tl.load( + phi_ptr + k_offs[:, None] * stride_phik + m_offs[None, :] * stride_phim, + mask=(k_offs[:, 
None] < K) & (m_offs[None, :] < M), + other=0.0, + ) + if CAST_FP32: + phi = phi.to(tl.float32) + grad_z = grad_mix * invr[:, None] + else: + grad_z = (grad_mix * invr[:, None]).to(phi.dtype) + + acc += tl.dot(grad_z, tl.trans(phi)) + + dphi = tl.dot(tl.trans(x), grad_z) + tl.atomic_add( + grad_phi_ptr + k_offs[:, None] * stride_gpk + m_offs[None, :] * stride_gpm, + dphi, + mask=(k_offs[:, None] < K) & (m_offs[None, :] < M), + ) + + g = g_acc / invr + invr3 = invr * invr * invr + factor = (-g * invr3) / K + + gx = acc + x_f * factor[:, None] + + if CAST_FP32: + tl.store( + grad_x_ptr + n_offs[:, None] * stride_gxn + k_offs[None, :] * stride_gxk, + gx, + mask=(n_offs[:, None] < N) & (k_offs[None, :] < K), + ) + else: + tl.store( + grad_x_ptr + n_offs[:, None] * stride_gxn + k_offs[None, :] * stride_gxk, + gx.to(x.dtype), + mask=(n_offs[:, None] < N) & (k_offs[None, :] < K), + ) + + +def mhc_mm_norm_bwd( + x: torch.Tensor, + phi: torch.Tensor, + mix: torch.Tensor, + invr: torch.Tensor, + grad_mix: torch.Tensor, + *, + out_grad_x: Optional[torch.Tensor] = None, + out_grad_phi: Optional[torch.Tensor] = None, + block_n: int = 32, + block_k: int = 256, + block_m: int = 32, + num_warps: int = 4, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Triton backward for `mhc_mm_norm_fwd`. + + Returns: + grad_x: [N, K] same dtype as x + grad_phi: [K, M] FP32 (safe for atomic adds; cast on return if needed) + + Note: + grad_phi is accumulated via atomic_add in FP32. For very large N + (batch * sequence length > 1M), accumulated rounding errors may + become noticeable. This is typically not an issue for standard + training configurations. + """ + assert ( + x.is_contiguous() + and phi.is_contiguous() + and mix.is_contiguous() + and invr.is_contiguous() + and grad_mix.is_contiguous() + ) + + N, K = x.shape + K2, M = phi.shape + assert K2 == K + assert mix.shape == (N, M) + assert grad_mix.shape == (N, M) + assert invr.shape == (N,) + + if out_grad_x is None: + out_grad_x = torch.empty_like(x) + if out_grad_phi is None: + out_grad_phi = torch.zeros((K, M), device=x.device, dtype=torch.float32) + + use_tc = (x.dtype == phi.dtype) and (x.dtype in (torch.float16, torch.bfloat16)) + + grid = (triton.cdiv(N, block_n), triton.cdiv(K, block_k)) + _mhc_mm_norm_bwd_fused_kernel[grid]( + x, + phi, + mix, + invr, + grad_mix, + out_grad_x, + out_grad_phi, + N=N, + K=K, + M=M, + stride_xn=x.stride(0), + stride_xk=x.stride(1), + stride_phik=phi.stride(0), + stride_phim=phi.stride(1), + stride_mn=mix.stride(0), + stride_mm=mix.stride(1), + stride_invr=invr.stride(0), + stride_gmn=grad_mix.stride(0), + stride_gmm=grad_mix.stride(1), + stride_gxn=out_grad_x.stride(0), + stride_gxk=out_grad_x.stride(1), + stride_gpk=out_grad_phi.stride(0), + stride_gpm=out_grad_phi.stride(1), + BLOCK_N=block_n, + BLOCK_K=block_k, + BLOCK_M=block_m, + CAST_FP32=not use_tc, + num_warps=num_warps, + ) + + if out_grad_phi.dtype != phi.dtype: + out_grad_phi = out_grad_phi.to(phi.dtype) + return out_grad_x, out_grad_phi + + +# ------------------------------------------------------------------------------------------------- +# Sinkhorn-Knopp forward/backward for H_res (Eq. 
19) +# ------------------------------------------------------------------------------------------------- + + +@triton.jit +def _mhc_split_sinkhorn_fwd_kernel( + mix_ptr, + b_ptr, + hpre_ptr, + hpost_ptr, + hres_ptr, + hist_ptr, + N: tl.constexpr, + HC: tl.constexpr, + M: tl.constexpr, + stride_mn: tl.constexpr, + stride_mm: tl.constexpr, + stride_hp_n: tl.constexpr, + stride_hp_h: tl.constexpr, + stride_hq_n: tl.constexpr, + stride_hq_h: tl.constexpr, + stride_hr_n: tl.constexpr, + stride_hr_i: tl.constexpr, + stride_hr_j: tl.constexpr, + stride_hn: tl.constexpr, + stride_ht: tl.constexpr, + stride_hi: tl.constexpr, + stride_hj: tl.constexpr, + alpha_pre_ptr, + alpha_post_ptr, + alpha_res_ptr, + pre_eps: tl.constexpr, + sinkhorn_eps: tl.constexpr, + post_mult: tl.constexpr, + TMAX: tl.constexpr, + STORE_HIST: tl.constexpr, +): + pid = tl.program_id(0) + if pid >= N: + return + + # Load scalar alpha parameters from GPU memory (avoids CPU sync) + alpha_pre = tl.load(alpha_pre_ptr).to(tl.float32) + alpha_post = tl.load(alpha_post_ptr).to(tl.float32) + alpha_res = tl.load(alpha_res_ptr).to(tl.float32) + + # --- Pre/post logits + j = tl.arange(0, HC) + mix_pre = tl.load(mix_ptr + pid * stride_mn + j * stride_mm).to(tl.float32) + mix_post = tl.load(mix_ptr + pid * stride_mn + (HC + j) * stride_mm).to(tl.float32) + + b_pre = tl.load(b_ptr + j).to(tl.float32) + b_post = tl.load(b_ptr + (HC + j)).to(tl.float32) + + pre_logits = mix_pre * alpha_pre + b_pre + post_logits = mix_post * alpha_post + b_post + + pre = tl.sigmoid(pre_logits) + pre_eps + post = tl.sigmoid(post_logits) * post_mult + + tl.store(hpre_ptr + pid * stride_hp_n + j * stride_hp_h, pre) + tl.store(hpost_ptr + pid * stride_hq_n + j * stride_hq_h, post) + + # --- Residual logits matrix [HC, HC] + rows = tl.arange(0, HC)[:, None] + cols = tl.arange(0, HC)[None, :] + flat = rows * HC + cols # [HC,HC] + + mix_res = tl.load(mix_ptr + pid * stride_mn + (2 * HC + flat) * stride_mm).to(tl.float32) + b_res = tl.load(b_ptr + (2 * HC + flat)).to(tl.float32) + + logits = mix_res * alpha_res + b_res + + # Sinkhorn: initial row-softmax (stable) then alternating row/col norms. 
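+    # Each pair of steps divides rows, then columns, by their sums, driving `mat`
+    # toward a doubly stochastic matrix. Illustrative 2x2 sketch (eps ignored):
+    # starting from the row-normalized [[0.8, 0.2], [0.5, 0.5]], the column step
+    # gives [[0.8/1.3, 0.2/0.7], [0.5/1.3, 0.5/0.7]], and alternating steps
+    # converge until every row and column sums to ~1.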
+ row_max = tl.max(logits, axis=1) + e = tl.exp(logits - row_max[:, None]) + row_sum = tl.sum(e, axis=1) + mat = e / row_sum[:, None] + sinkhorn_eps + + col_sum = tl.sum(mat, axis=0) + mat = mat / (col_sum[None, :] + sinkhorn_eps) + + if STORE_HIST: + tl.store( + hist_ptr + pid * stride_hn + 0 * stride_ht + rows * stride_hi + cols * stride_hj, + mat, + ) + + for t in tl.static_range(0, TMAX - 1): + row_sum = tl.sum(mat, axis=1) + mat = mat / (row_sum[:, None] + sinkhorn_eps) + col_sum = tl.sum(mat, axis=0) + mat = mat / (col_sum[None, :] + sinkhorn_eps) + if STORE_HIST: + tl.store( + hist_ptr + pid * stride_hn + (t + 1) * stride_ht + rows * stride_hi + cols * stride_hj, + mat, + ) + + # Store h_res [N, HC, HC] (row-major: out, in) + tl.store(hres_ptr + pid * stride_hr_n + rows * stride_hr_i + cols * stride_hr_j, mat) + + +@triton.jit +def _mhc_sinkhorn_bwd_kernel( + mix_ptr, + b_ptr, + grad_out_ptr, + grad_logits_ptr, + N: tl.constexpr, + HC: tl.constexpr, + stride_mn: tl.constexpr, + stride_mm: tl.constexpr, + stride_go_n: tl.constexpr, + stride_go_i: tl.constexpr, + stride_go_j: tl.constexpr, + stride_gl_n: tl.constexpr, + stride_gl_i: tl.constexpr, + stride_gl_j: tl.constexpr, + alpha_res_ptr, + sinkhorn_eps: tl.constexpr, + TMAX: tl.constexpr, +): + pid = tl.program_id(0) + if pid >= N: + return + + alpha_res = tl.load(alpha_res_ptr).to(tl.float32) + + rows = tl.arange(0, HC)[:, None] + cols = tl.arange(0, HC)[None, :] + flat = rows * HC + cols + + # Rebuild logits + mix_res = tl.load(mix_ptr + pid * stride_mn + (2 * HC + flat) * stride_mm).to(tl.float32) + b_res = tl.load(b_ptr + (2 * HC + flat)).to(tl.float32) + logits = mix_res * alpha_res + b_res + + # Forward recompute (no lists) and backward with recompute per step. + row_max = tl.max(logits, axis=1) + e = tl.exp(logits - row_max[:, None]) + row_sum0 = tl.sum(e, axis=1) + p = e / row_sum0[:, None] # softmax, row-wise + p_eps = p + sinkhorn_eps + + col_sum0 = tl.sum(p_eps, axis=0) + mat0 = p_eps / (col_sum0[None, :] + sinkhorn_eps) + + # Start backward from grad_out + g = tl.load( + grad_out_ptr + pid * stride_go_n + rows * stride_go_i + cols * stride_go_j, + ).to(tl.float32) + + # Reverse iterations (TMAX-1 .. 
1), recomputing mat_t, rs_t, cs_t + for t in tl.static_range(TMAX - 1, 0, -1): + mat = mat0 + rs_t = row_sum0 + cs_t = col_sum0 + mat_t = mat0 + + for s in tl.static_range(1, TMAX): + rs = tl.sum(mat, axis=1) + mat = mat / (rs[:, None] + sinkhorn_eps) + cs = tl.sum(mat, axis=0) + mat = mat / (cs[None, :] + sinkhorn_eps) + if s == t: + mat_t = mat + rs_t = rs + cs_t = cs + + denom_col = cs_t + sinkhorn_eps # [HC] + dot_col = tl.sum(g * mat_t, axis=0) # [HC] + g_row = (g - dot_col[None, :]) / denom_col[None, :] + + m_row = mat_t * denom_col[None, :] # invert col norm: m_row = m_out * denom + denom_row = rs_t + sinkhorn_eps + dot_row = tl.sum(g_row * m_row, axis=1) + g = (g_row - dot_row[:, None]) / denom_row[:, None] + + # Undo initial col norm (t=0) + denom_col0 = col_sum0 + sinkhorn_eps + dot_col0 = tl.sum(g * mat0, axis=0) + g_p = (g - dot_col0[None, :]) / denom_col0[None, :] + + # Softmax backward on rows: p * (g_p - sum(g_p * p)) + dot_soft = tl.sum(g_p * p, axis=1) + grad_logits = p * (g_p - dot_soft[:, None]) + + tl.store(grad_logits_ptr + pid * stride_gl_n + rows * stride_gl_i + cols * stride_gl_j, grad_logits) + + +@triton.jit +def _mhc_sinkhorn_bwd_hist_kernel( + mix_ptr, + b_ptr, + hist_ptr, + grad_out_ptr, + grad_logits_ptr, + N: tl.constexpr, + HC: tl.constexpr, + stride_mn: tl.constexpr, + stride_mm: tl.constexpr, + stride_hn: tl.constexpr, + stride_ht: tl.constexpr, + stride_hi: tl.constexpr, + stride_hj: tl.constexpr, + stride_go_n: tl.constexpr, + stride_go_i: tl.constexpr, + stride_go_j: tl.constexpr, + stride_gl_n: tl.constexpr, + stride_gl_i: tl.constexpr, + stride_gl_j: tl.constexpr, + alpha_res_ptr, + sinkhorn_eps: tl.constexpr, + TMAX: tl.constexpr, +): + pid = tl.program_id(0) + if pid >= N: + return + + alpha_res = tl.load(alpha_res_ptr).to(tl.float32) + + rows = tl.arange(0, HC)[:, None] + cols = tl.arange(0, HC)[None, :] + flat = rows * HC + cols + + # Rebuild logits + mix_res = tl.load(mix_ptr + pid * stride_mn + (2 * HC + flat) * stride_mm).to(tl.float32) + b_res = tl.load(b_ptr + (2 * HC + flat)).to(tl.float32) + logits = mix_res * alpha_res + b_res + + # Initial row-softmax + row_max = tl.max(logits, axis=1) + e = tl.exp(logits - row_max[:, None]) + row_sum0 = tl.sum(e, axis=1) + p = e / row_sum0[:, None] + p_eps = p + sinkhorn_eps + + col_sum0 = tl.sum(p_eps, axis=0) + mat0 = p_eps / (col_sum0[None, :] + sinkhorn_eps) + + # Start backward from grad_out + g = tl.load( + grad_out_ptr + pid * stride_go_n + rows * stride_go_i + cols * stride_go_j, + ).to(tl.float32) + + # Reverse iterations (TMAX-1 .. 
1) using stored mats + for t in tl.static_range(TMAX - 1, 0, -1): + mat_t = tl.load(hist_ptr + pid * stride_hn + t * stride_ht + rows * stride_hi + cols * stride_hj).to(tl.float32) + mat_prev = tl.load(hist_ptr + pid * stride_hn + (t - 1) * stride_ht + rows * stride_hi + cols * stride_hj).to( + tl.float32 + ) + + row_sum = tl.sum(mat_prev, axis=1) + mat_row = mat_prev / (row_sum[:, None] + sinkhorn_eps) + col_sum = tl.sum(mat_row, axis=0) + denom_col = col_sum + sinkhorn_eps + + dot_col = tl.sum(g * mat_t, axis=0) + g_row = (g - dot_col[None, :]) / denom_col[None, :] + + m_row = mat_t * denom_col[None, :] + denom_row = row_sum + sinkhorn_eps + dot_row = tl.sum(g_row * m_row, axis=1) + g = (g_row - dot_row[:, None]) / denom_row[:, None] + + # Undo initial col norm (t=0) + denom_col0 = col_sum0 + sinkhorn_eps + dot_col0 = tl.sum(g * mat0, axis=0) + g_p = (g - dot_col0[None, :]) / denom_col0[None, :] + + # Softmax backward on rows: p * (g_p - sum(g_p * p)) + dot_soft = tl.sum(g_p * p, axis=1) + grad_logits = p * (g_p - dot_soft[:, None]) + + tl.store(grad_logits_ptr + pid * stride_gl_n + rows * stride_gl_i + cols * stride_gl_j, grad_logits) + + +def mhc_split_sinkhorn_fwd( + mix: torch.Tensor, + b: torch.Tensor, + alpha_pre: torch.Tensor, + alpha_post: torch.Tensor, + alpha_res: torch.Tensor, + *, + tmax: int, + pre_eps: float, + sinkhorn_eps: float, + post_mult: float, + out_hpre: Optional[torch.Tensor] = None, + out_hpost: Optional[torch.Tensor] = None, + out_hres: Optional[torch.Tensor] = None, + out_hist: Optional[torch.Tensor] = None, + return_hist: bool = False, + num_warps: int = 1, +) -> Union[ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] +]: + """ + Compute h_pre, h_post, h_res from `mix` (already normalized by RMS scalar). 
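+
+    The columns of `mix` are consumed in a fixed layout: [0, HC) feeds the sigmoid
+    gate h_pre, [HC, 2*HC) feeds h_post, and the trailing HC*HC columns are the
+    residual logits passed through the Sinkhorn iteration (e.g. HC=4 implies
+    M = 16 + 8 = 24).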
+ + mix: [N, M] float32 where M = HC*HC + 2*HC + b: [M] float32 + """ + assert mix.is_contiguous() and b.is_contiguous() + + N, M = mix.shape + assert M == b.numel() + # infer HC from M = HC*HC + 2*HC + # Solve HC^2 + 2HC - M = 0 + HC = int((math.isqrt(4 + 4 * M) - 2) // 2) + assert HC * HC + 2 * HC == M, f"Invalid M for mHC: M={M}" + + if out_hpre is None: + out_hpre = torch.empty((N, HC), device=mix.device, dtype=torch.float32) + if out_hpost is None: + out_hpost = torch.empty((N, HC), device=mix.device, dtype=torch.float32) + if out_hres is None: + out_hres = torch.empty((N, HC, HC), device=mix.device, dtype=torch.float32) + if return_hist: + if out_hist is None: + out_hist = torch.empty((N, tmax, HC, HC), device=mix.device, dtype=torch.float32) + else: + if out_hist is None: + out_hist = torch.empty((1,), device=mix.device, dtype=torch.float32) + + grid = (N,) + + _mhc_split_sinkhorn_fwd_kernel[grid]( + mix, + b, + out_hpre, + out_hpost, + out_hres, + out_hist, + N=N, + HC=HC, + M=M, + stride_mn=mix.stride(0), + stride_mm=mix.stride(1), + stride_hp_n=out_hpre.stride(0), + stride_hp_h=out_hpre.stride(1), + stride_hq_n=out_hpost.stride(0), + stride_hq_h=out_hpost.stride(1), + stride_hr_n=out_hres.stride(0), + stride_hr_i=out_hres.stride(1), + stride_hr_j=out_hres.stride(2), + stride_hn=out_hist.stride(0) if out_hist.ndim > 1 else 0, + stride_ht=out_hist.stride(1) if out_hist.ndim > 1 else 0, + stride_hi=out_hist.stride(2) if out_hist.ndim > 1 else 0, + stride_hj=out_hist.stride(3) if out_hist.ndim > 1 else 0, + alpha_pre_ptr=alpha_pre.contiguous(), + alpha_post_ptr=alpha_post.contiguous(), + alpha_res_ptr=alpha_res.contiguous(), + pre_eps=pre_eps, + sinkhorn_eps=sinkhorn_eps, + post_mult=post_mult, + TMAX=tmax, + STORE_HIST=return_hist, + num_warps=num_warps, + ) + if return_hist: + return out_hpre, out_hpost, out_hres, out_hist + return out_hpre, out_hpost, out_hres + + +def mhc_sinkhorn_bwd( + mix: torch.Tensor, + b: torch.Tensor, + alpha_res: torch.Tensor, + grad_hres: torch.Tensor, + *, + tmax: int, + sinkhorn_eps: float, + hist: Optional[torch.Tensor] = None, + out_grad_logits: Optional[torch.Tensor] = None, + num_warps: int = 1, +) -> torch.Tensor: + """ + Backward for Sinkhorn: returns grad_logits (same shape as h_res). 
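+
+    When `hist` (the per-iteration matrices recorded by the forward pass with
+    return_hist=True) is provided, the stored-history kernel replays the saved
+    iterates directly; otherwise each reverse step recomputes the forward chain
+    from the logits, trading extra compute for lower memory.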
+ + mix: [N, M] float32 + b: [M] float32 + grad_hres: [N, HC, HC] float32 + """ + assert mix.is_contiguous() and b.is_contiguous() and grad_hres.is_contiguous() + + N, M = mix.shape + HC = grad_hres.shape[1] + assert grad_hres.shape == (N, HC, HC) + assert M == HC * HC + 2 * HC + + if out_grad_logits is None: + out_grad_logits = torch.empty((N, HC, HC), device=mix.device, dtype=torch.float32) + + grid = (N,) + + alpha_res_c = alpha_res.contiguous() + + if hist is not None: + assert hist.is_contiguous() + assert hist.shape == (N, tmax, HC, HC) + _mhc_sinkhorn_bwd_hist_kernel[grid]( + mix, + b, + hist, + grad_hres, + out_grad_logits, + N=N, + HC=HC, + stride_mn=mix.stride(0), + stride_mm=mix.stride(1), + stride_hn=hist.stride(0), + stride_ht=hist.stride(1), + stride_hi=hist.stride(2), + stride_hj=hist.stride(3), + stride_go_n=grad_hres.stride(0), + stride_go_i=grad_hres.stride(1), + stride_go_j=grad_hres.stride(2), + stride_gl_n=out_grad_logits.stride(0), + stride_gl_i=out_grad_logits.stride(1), + stride_gl_j=out_grad_logits.stride(2), + alpha_res_ptr=alpha_res_c, + sinkhorn_eps=sinkhorn_eps, + TMAX=tmax, + num_warps=num_warps, + ) + else: + _mhc_sinkhorn_bwd_kernel[grid]( + mix, + b, + grad_hres, + out_grad_logits, + N=N, + HC=HC, + stride_mn=mix.stride(0), + stride_mm=mix.stride(1), + stride_go_n=grad_hres.stride(0), + stride_go_i=grad_hres.stride(1), + stride_go_j=grad_hres.stride(2), + stride_gl_n=out_grad_logits.stride(0), + stride_gl_i=out_grad_logits.stride(1), + stride_gl_j=out_grad_logits.stride(2), + alpha_res_ptr=alpha_res_c, + sinkhorn_eps=sinkhorn_eps, + TMAX=tmax, + num_warps=num_warps, + ) + return out_grad_logits + + +# ------------------------------------------------------------------------------------------------- +# Apply kernels: mhc_pre and mhc_post_res (forward + backward) +# ------------------------------------------------------------------------------------------------- + + +@triton.jit +def _mhc_pre_fwd_kernel( + x_ptr, + hpre_ptr, + out_ptr, + N: tl.constexpr, + HC: tl.constexpr, + C: tl.constexpr, + stride_xn: tl.constexpr, + stride_xh: tl.constexpr, + stride_xc: tl.constexpr, + stride_hn: tl.constexpr, + stride_hh: tl.constexpr, + stride_on: tl.constexpr, + stride_oc: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_C: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_c = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + c_offs = pid_c * BLOCK_C + tl.arange(0, BLOCK_C) + + acc = tl.zeros((BLOCK_N, BLOCK_C), tl.float32) + for s in tl.static_range(0, HC): + h_s = tl.load( + hpre_ptr + n_offs * stride_hn + s * stride_hh, + mask=(n_offs < N), + other=0.0, + ).to(tl.float32) + xs = tl.load( + x_ptr + n_offs[:, None] * stride_xn + s * stride_xh + c_offs[None, :] * stride_xc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + acc += xs * h_s[:, None] + + tl.store( + out_ptr + n_offs[:, None] * stride_on + c_offs[None, :] * stride_oc, + acc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + ) + + +@triton.jit +def _mhc_pre_bwd_kernel( + x_ptr, + hpre_ptr, + grad_out_ptr, + grad_x_ptr, + grad_h_ptr, + N: tl.constexpr, + HC: tl.constexpr, + C: tl.constexpr, + stride_xn: tl.constexpr, + stride_xh: tl.constexpr, + stride_xc: tl.constexpr, + stride_hn: tl.constexpr, + stride_hh: tl.constexpr, + stride_gon: tl.constexpr, + stride_goc: tl.constexpr, + stride_gxn: tl.constexpr, + stride_gxh: tl.constexpr, + stride_gxc: tl.constexpr, + stride_ghn: tl.constexpr, + stride_ghh: tl.constexpr, + BLOCK_N: tl.constexpr, + 
BLOCK_C: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_c = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + c_offs = pid_c * BLOCK_C + tl.arange(0, BLOCK_C) + + go = tl.load( + grad_out_ptr + n_offs[:, None] * stride_gon + c_offs[None, :] * stride_goc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + + # grad_x = grad_out * hpre + for s in tl.static_range(0, HC): + h_s = tl.load( + hpre_ptr + n_offs * stride_hn + s * stride_hh, + mask=(n_offs < N), + other=0.0, + ).to(tl.float32) + gx = go * h_s[:, None] + tl.store( + grad_x_ptr + n_offs[:, None] * stride_gxn + s * stride_gxh + c_offs[None, :] * stride_gxc, + gx, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + ) + + # grad_hpre: dot(go, x_s) over C -> atomic add + xs = tl.load( + x_ptr + n_offs[:, None] * stride_xn + s * stride_xh + c_offs[None, :] * stride_xc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + part = tl.sum(go * xs, axis=1) + tl.atomic_add( + grad_h_ptr + n_offs * stride_ghn + s * stride_ghh, + part, + mask=n_offs < N, + ) + + +def mhc_pre_fwd( + x: torch.Tensor, + h_pre: torch.Tensor, + *, + out: Optional[torch.Tensor] = None, + block_n: int = 32, + block_c: int = 128, + num_warps: int = 4, +) -> torch.Tensor: + assert x.is_contiguous() and h_pre.is_contiguous() + N, HC, C = x.shape + assert h_pre.shape == (N, HC) + + if out is None: + out = torch.empty((N, C), device=x.device, dtype=torch.float32) + + grid = (triton.cdiv(N, block_n), triton.cdiv(C, block_c)) + _mhc_pre_fwd_kernel[grid]( + x, + h_pre, + out, + N=N, + HC=HC, + C=C, + stride_xn=x.stride(0), + stride_xh=x.stride(1), + stride_xc=x.stride(2), + stride_hn=h_pre.stride(0), + stride_hh=h_pre.stride(1), + stride_on=out.stride(0), + stride_oc=out.stride(1), + BLOCK_N=block_n, + BLOCK_C=block_c, + num_warps=num_warps, + ) + return out + + +def mhc_pre_bwd( + x: torch.Tensor, + h_pre: torch.Tensor, + grad_out: torch.Tensor, + *, + out_grad_x: Optional[torch.Tensor] = None, + out_grad_h: Optional[torch.Tensor] = None, + block_n: int = 32, + block_c: int = 128, + num_warps: int = 4, +) -> Tuple[torch.Tensor, torch.Tensor]: + assert x.is_contiguous() and h_pre.is_contiguous() and grad_out.is_contiguous() + N, HC, C = x.shape + assert grad_out.shape == (N, C) + + if out_grad_x is None: + out_grad_x = torch.empty_like(x, dtype=torch.float32) + if out_grad_h is None: + out_grad_h = torch.zeros((N, HC), device=x.device, dtype=torch.float32) + + grid = (triton.cdiv(N, block_n), triton.cdiv(C, block_c)) + _mhc_pre_bwd_kernel[grid]( + x, + h_pre, + grad_out, + out_grad_x, + out_grad_h, + N=N, + HC=HC, + C=C, + stride_xn=x.stride(0), + stride_xh=x.stride(1), + stride_xc=x.stride(2), + stride_hn=h_pre.stride(0), + stride_hh=h_pre.stride(1), + stride_gon=grad_out.stride(0), + stride_goc=grad_out.stride(1), + stride_gxn=out_grad_x.stride(0), + stride_gxh=out_grad_x.stride(1), + stride_gxc=out_grad_x.stride(2), + stride_ghn=out_grad_h.stride(0), + stride_ghh=out_grad_h.stride(1), + BLOCK_N=block_n, + BLOCK_C=block_c, + num_warps=num_warps, + ) + return out_grad_x, out_grad_h + + +@triton.jit +def _mhc_post_res_fwd_kernel( + x_ptr, + f_ptr, + hpost_ptr, + hres_ptr, + out_ptr, + N: tl.constexpr, + HC: tl.constexpr, + C: tl.constexpr, + stride_xn: tl.constexpr, + stride_xh: tl.constexpr, + stride_xc: tl.constexpr, + stride_fn: tl.constexpr, + stride_fc: tl.constexpr, + stride_hpn: tl.constexpr, + stride_hph: tl.constexpr, + stride_hrn: tl.constexpr, + stride_hri: 
tl.constexpr, + stride_hrj: tl.constexpr, + stride_on: tl.constexpr, + stride_oh: tl.constexpr, + stride_oc: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_C: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_c = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + c_offs = pid_c * BLOCK_C + tl.arange(0, BLOCK_C) + + f = tl.load( + f_ptr + n_offs[:, None] * stride_fn + c_offs[None, :] * stride_fc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + + o2 = tl.arange(0, HC)[:, None] # [HC,1] + hpost = tl.load( + hpost_ptr + n_offs[None, :] * stride_hpn + o2 * stride_hph, + mask=(n_offs[None, :] < N), + other=0.0, + ).to(tl.float32) # [HC, BN] + + acc = f[None, :, :] * hpost[:, :, None] # [HC, BN, BC] + + # residual mixing: sum_i hres[o,i] * x_i + for i in tl.static_range(0, HC): + xs = tl.load( + x_ptr + n_offs[:, None] * stride_xn + i * stride_xh + c_offs[None, :] * stride_xc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) # [BN, BC] + w = tl.load( + hres_ptr + n_offs[None, :] * stride_hrn + o2 * stride_hri + i * stride_hrj, + mask=(n_offs[None, :] < N), + other=0.0, + ).to(tl.float32) # [HC, BN] + acc += xs[None, :, :] * w[:, :, None] + + o3 = tl.arange(0, HC)[:, None, None] + n3 = n_offs[None, :, None] + c3 = c_offs[None, None, :] + tl.store( + out_ptr + n3 * stride_on + o3 * stride_oh + c3 * stride_oc, + acc, + mask=(n3 < N) & (c3 < C), + ) + + +@triton.jit +def _mhc_post_res_bwd_kernel( + x_ptr, + f_ptr, + hpost_ptr, + hres_ptr, + grad_out_ptr, + grad_x_ptr, + grad_f_ptr, + grad_hpost_ptr, + grad_hres_ptr, + N: tl.constexpr, + HC: tl.constexpr, + C: tl.constexpr, + stride_xn: tl.constexpr, + stride_xh: tl.constexpr, + stride_xc: tl.constexpr, + stride_fn: tl.constexpr, + stride_fc: tl.constexpr, + stride_hpn: tl.constexpr, + stride_hph: tl.constexpr, + stride_hrn: tl.constexpr, + stride_hri: tl.constexpr, + stride_hrj: tl.constexpr, + stride_gon: tl.constexpr, + stride_goh: tl.constexpr, + stride_goc: tl.constexpr, + stride_gxn: tl.constexpr, + stride_gxh: tl.constexpr, + stride_gxc: tl.constexpr, + stride_gfn: tl.constexpr, + stride_gfc: tl.constexpr, + stride_ghpn: tl.constexpr, + stride_ghph: tl.constexpr, + stride_ghrn: tl.constexpr, + stride_ghri: tl.constexpr, + stride_ghrj: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_C: tl.constexpr, +): + pid_n = tl.program_id(0) + pid_c = tl.program_id(1) + + n_offs = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + c_offs = pid_c * BLOCK_C + tl.arange(0, BLOCK_C) + + f = tl.load( + f_ptr + n_offs[:, None] * stride_fn + c_offs[None, :] * stride_fc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + + o2 = tl.arange(0, HC)[:, None] # [HC,1] + hpost = tl.load( + hpost_ptr + n_offs[None, :] * stride_hpn + o2 * stride_hph, + mask=(n_offs[None, :] < N), + other=0.0, + ).to(tl.float32) # [HC, BN] + + o3 = tl.arange(0, HC)[:, None, None] + n3 = n_offs[None, :, None] + c3 = c_offs[None, None, :] + go = tl.load( + grad_out_ptr + n3 * stride_gon + o3 * stride_goh + c3 * stride_goc, + mask=(n3 < N) & (c3 < C), + other=0.0, + ).to(tl.float32) # [HC, BN, BC] + + # grad_f: sum_o go[o] * hpost[o] + gf = tl.sum(go * hpost[:, :, None], axis=0) + tl.store( + grad_f_ptr + n_offs[:, None] * stride_gfn + c_offs[None, :] * stride_gfc, + gf, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + ) + + # grad_hpost: dot(go[o], f) over C (atomic over C blocks) + part_hpost = tl.sum(go * f[None, :, :], axis=2) # [HC, BN] + tl.atomic_add( + 
grad_hpost_ptr + n_offs[None, :] * stride_ghpn + o2 * stride_ghph, + part_hpost, + mask=(n_offs[None, :] < N), + ) + + # grad_x: hres^T @ go (in-stream i gets sum_o hres[o,i] * go[o]) + for i in tl.static_range(0, HC): + w = tl.load( + hres_ptr + n_offs[None, :] * stride_hrn + o2 * stride_hri + i * stride_hrj, + mask=(n_offs[None, :] < N), + other=0.0, + ).to(tl.float32) # [HC, BN] + gx = tl.sum(go * w[:, :, None], axis=0) # [BN, BC] + tl.store( + grad_x_ptr + n_offs[:, None] * stride_gxn + i * stride_gxh + c_offs[None, :] * stride_gxc, + gx, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + ) + + # grad_hres[o,i]: dot(go[o], x[i]) over C (atomic) + for i in tl.static_range(0, HC): + xi = tl.load( + x_ptr + n_offs[:, None] * stride_xn + i * stride_xh + c_offs[None, :] * stride_xc, + mask=(n_offs[:, None] < N) & (c_offs[None, :] < C), + other=0.0, + ).to(tl.float32) + part_hres = tl.sum(go * xi[None, :, :], axis=2) # [HC, BN] + tl.atomic_add( + grad_hres_ptr + n_offs[None, :] * stride_ghrn + o2 * stride_ghri + i * stride_ghrj, + part_hres, + mask=(n_offs[None, :] < N), + ) + + +def mhc_post_res_fwd( + x: torch.Tensor, + f_out: torch.Tensor, + h_post: torch.Tensor, + h_res: torch.Tensor, + *, + out: Optional[torch.Tensor] = None, + block_n: Optional[int] = None, + block_c: Optional[int] = None, + num_warps: Optional[int] = None, + num_stages: Optional[int] = None, +) -> torch.Tensor: + assert x.is_contiguous() and f_out.is_contiguous() and h_post.is_contiguous() and h_res.is_contiguous() + + N, HC, C = x.shape + assert f_out.shape == (N, C) + assert h_post.shape == (N, HC) + assert h_res.shape == (N, HC, HC) + + if out is None: + out = torch.empty((N, HC, C), device=x.device, dtype=torch.float32) + + block_n, block_c, num_warps, num_stages = _post_res_meta(C, block_n, block_c, num_warps, num_stages) + + grid = (triton.cdiv(N, block_n), triton.cdiv(C, block_c)) + _mhc_post_res_fwd_kernel[grid]( + x, + f_out, + h_post, + h_res, + out, + N=N, + HC=HC, + C=C, + stride_xn=x.stride(0), + stride_xh=x.stride(1), + stride_xc=x.stride(2), + stride_fn=f_out.stride(0), + stride_fc=f_out.stride(1), + stride_hpn=h_post.stride(0), + stride_hph=h_post.stride(1), + stride_hrn=h_res.stride(0), + stride_hri=h_res.stride(1), + stride_hrj=h_res.stride(2), + stride_on=out.stride(0), + stride_oh=out.stride(1), + stride_oc=out.stride(2), + BLOCK_N=block_n, + BLOCK_C=block_c, + num_warps=num_warps, + num_stages=num_stages, + ) + return out + + +def mhc_post_res_bwd( + x: torch.Tensor, + f_out: torch.Tensor, + h_post: torch.Tensor, + h_res: torch.Tensor, + grad_out: torch.Tensor, + *, + out_grad_x: Optional[torch.Tensor] = None, + out_grad_f: Optional[torch.Tensor] = None, + out_grad_hpost: Optional[torch.Tensor] = None, + out_grad_hres: Optional[torch.Tensor] = None, + block_n: Optional[int] = None, + block_c: Optional[int] = None, + num_warps: Optional[int] = None, + num_stages: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + assert ( + x.is_contiguous() + and f_out.is_contiguous() + and h_post.is_contiguous() + and h_res.is_contiguous() + and grad_out.is_contiguous() + ) + + N, HC, C = x.shape + assert grad_out.shape == (N, HC, C) + + if out_grad_x is None: + out_grad_x = torch.empty_like(x, dtype=torch.float32) + if out_grad_f is None: + out_grad_f = torch.empty_like(f_out, dtype=torch.float32) + if out_grad_hpost is None: + out_grad_hpost = torch.zeros((N, HC), device=x.device, dtype=torch.float32) + if out_grad_hres is None: + out_grad_hres = torch.zeros((N, 
HC, HC), device=x.device, dtype=torch.float32) + + block_n, block_c, num_warps, num_stages = _post_res_meta(C, block_n, block_c, num_warps, num_stages) + + grid = (triton.cdiv(N, block_n), triton.cdiv(C, block_c)) + _mhc_post_res_bwd_kernel[grid]( + x, + f_out, + h_post, + h_res, + grad_out, + out_grad_x, + out_grad_f, + out_grad_hpost, + out_grad_hres, + N=N, + HC=HC, + C=C, + stride_xn=x.stride(0), + stride_xh=x.stride(1), + stride_xc=x.stride(2), + stride_fn=f_out.stride(0), + stride_fc=f_out.stride(1), + stride_hpn=h_post.stride(0), + stride_hph=h_post.stride(1), + stride_hrn=h_res.stride(0), + stride_hri=h_res.stride(1), + stride_hrj=h_res.stride(2), + stride_gon=grad_out.stride(0), + stride_goh=grad_out.stride(1), + stride_goc=grad_out.stride(2), + stride_gxn=out_grad_x.stride(0), + stride_gxh=out_grad_x.stride(1), + stride_gxc=out_grad_x.stride(2), + stride_gfn=out_grad_f.stride(0), + stride_gfc=out_grad_f.stride(1), + stride_ghpn=out_grad_hpost.stride(0), + stride_ghph=out_grad_hpost.stride(1), + stride_ghrn=out_grad_hres.stride(0), + stride_ghri=out_grad_hres.stride(1), + stride_ghrj=out_grad_hres.stride(2), + BLOCK_N=block_n, + BLOCK_C=block_c, + num_warps=num_warps, + num_stages=num_stages, + ) + return out_grad_x, out_grad_f, out_grad_hpost, out_grad_hres + + +def _flatten_tokens(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Size]: + """ + Flattens leading dimensions so x becomes [N, HC, C]. + Returns (x_flat, x_shape) where x_shape is the original shape. + """ + assert x.dim() >= 3, "x must be [..., HC, C]" + return x.contiguous().view(-1, x.shape[-2], x.shape[-1]), x.shape + + +class LigerMHCCoeffsFunction(torch.autograd.Function): + """ + Autograd function for mHC coefficient computation. + + Memory/Compute Trade-off: + When gradients are needed, Sinkhorn iteration history (hist) is saved + during forward to avoid recomputation in backward. This increases + memory usage by O(N * tmax * HC^2) but reduces backward compute. 
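+
+    Example (illustrative sketch; all sizes and hyperparameters below are
+    arbitrary, and a Triton-capable GPU is assumed):
+        >>> HC, C = 4, 512
+        >>> M = HC * HC + 2 * HC
+        >>> x = torch.randn(2, 8, HC, C, device="cuda", dtype=torch.bfloat16)
+        >>> phi = torch.randn(HC * C, M, device="cuda", dtype=torch.bfloat16)
+        >>> b = torch.zeros(M, device="cuda")
+        >>> one = torch.ones((), device="cuda")
+        >>> h_pre, h_post, h_res = LigerMHCCoeffsFunction.apply(
+        ...     x, phi, b, one, one, one, False, 8, 1e-6, 1e-3, 1e-6, 2.0
+        ... )
+        >>> h_res.shape
+        torch.Size([2, 8, 4, 4])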
+ """ + + @staticmethod + @ensure_contiguous + def forward( # type: ignore[override] + ctx: Any, + x: torch.Tensor, # [..., HC, C] bf16/fp16 (or fp32 if allow_fp32) + phi: torch.Tensor, # [HC*C, M] + b: torch.Tensor, # [M] + alpha_pre: torch.Tensor, # scalar + alpha_post: torch.Tensor, # scalar + alpha_res: torch.Tensor, # scalar + allow_fp32: bool, + tmax: int, + rms_eps: float, + pre_eps: float, + sinkhorn_eps: float, + post_mult: float, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if allow_fp32: + assert x.dtype in ( + torch.bfloat16, + torch.float16, + torch.float32, + ), "x should be BF16/FP16/FP32 when allow_fp32=True" + else: + assert x.dtype in (torch.bfloat16, torch.float16), "x should be BF16/FP16 (set allow_fp32=True for FP32)" + # Store original shape for restoring at the end + x_shape = x.shape + x_flat, _ = _flatten_tokens(x) + N, HC, C = x_flat.shape + K = HC * C + x_mat = x_flat.view(-1, K) + + assert phi.dim() == 2 and phi.shape[0] == K, f"phi must be [HC*C, M], got {tuple(phi.shape)}" + M = int(phi.shape[1]) + assert b.shape == (M,), f"b must be [M], got {tuple(b.shape)}" + + # (1) fused coeff matmul + norm + mix, invr = mhc_mm_norm_fwd(x_mat, phi, eps=float(rms_eps)) + + # (2) split + sigmoid + sinkhorn + need_hist = any(ctx.needs_input_grad) + if need_hist: + h_pre, h_post, h_res, hist = mhc_split_sinkhorn_fwd( + mix, + b, + alpha_pre, + alpha_post, + alpha_res, + tmax=int(tmax), + pre_eps=float(pre_eps), + sinkhorn_eps=float(sinkhorn_eps), + post_mult=float(post_mult), + return_hist=True, + ) + else: + h_pre, h_post, h_res = mhc_split_sinkhorn_fwd( + mix, + b, + alpha_pre, + alpha_post, + alpha_res, + tmax=int(tmax), + pre_eps=float(pre_eps), + sinkhorn_eps=float(sinkhorn_eps), + post_mult=float(post_mult), + ) + hist = None + + # Save for backward + if hist is not None: + ctx.save_for_backward(x_mat, phi, b, mix, invr, alpha_pre, alpha_post, alpha_res, hist) + else: + ctx.save_for_backward(x_mat, phi, b, mix, invr, alpha_pre, alpha_post, alpha_res) + ctx.meta = ( + x_shape, + HC, + C, + int(tmax), + float(sinkhorn_eps), + float(post_mult), + hist is not None, + ) + + # Reshape to original leading dims + outer = x_shape[:-2] + return ( + h_pre.view(*outer, HC), + h_post.view(*outer, HC), + h_res.view(*outer, HC, HC), + ) + + @staticmethod + @ensure_contiguous + def backward( + ctx: Any, + grad_h_pre: torch.Tensor | None, + grad_h_post: torch.Tensor | None, + grad_h_res: torch.Tensor | None, + ): + saved = ctx.saved_tensors + x_shape, HC, C, tmax, sinkhorn_eps, post_mult, has_hist = ctx.meta + if has_hist: + x_mat, phi, b, mix, invr, alpha_pre, alpha_post, alpha_res, hist = saved + else: + x_mat, phi, b, mix, invr, alpha_pre, alpha_post, alpha_res = saved + hist = None + N = x_mat.shape[0] + M = mix.shape[1] + assert M == HC * HC + 2 * HC + + need_pre = grad_h_pre is not None + need_post = grad_h_post is not None + need_res = grad_h_res is not None + + # flatten grads (None -> zeros) + if need_pre: + gh_pre = grad_h_pre.view(-1, HC).to(torch.float32) + else: + gh_pre = torch.zeros((N, HC), device=mix.device, dtype=torch.float32) + if need_post: + gh_post = grad_h_post.view(-1, HC).to(torch.float32) + else: + gh_post = torch.zeros((N, HC), device=mix.device, dtype=torch.float32) + if need_res: + gh_res = grad_h_res.view(-1, HC, HC).to(torch.float32) + else: + gh_res = torch.zeros((N, HC, HC), device=mix.device, dtype=torch.float32) + + # --- Sinkhorn backward -> grad logits for residual matrix + if need_res: + grad_res_logits = mhc_sinkhorn_bwd( + mix, + b, 
+ alpha_res, + gh_res, + tmax=tmax, + sinkhorn_eps=sinkhorn_eps, + hist=hist, + ) # [N, HC, HC] fp32 + else: + grad_res_logits = gh_res + + # --- Pre/post derivatives (sigmoid) + mix_pre = mix[:, :HC] + mix_post = mix[:, HC : 2 * HC] + mix_res = mix[:, 2 * HC :] + + b_pre = b[:HC] + b_post = b[HC : 2 * HC] + if need_pre: + pre_logits = mix_pre * alpha_pre + b_pre + pre_sig = torch.sigmoid(pre_logits) + grad_pre_logits = gh_pre * (pre_sig * (1.0 - pre_sig)) # [N,HC] + else: + grad_pre_logits = gh_pre + + if need_post: + post_logits = mix_post * alpha_post + b_post + post_sig = torch.sigmoid(post_logits) + grad_post_logits = gh_post * (post_mult * post_sig * (1.0 - post_sig)) # [N,HC] + else: + grad_post_logits = gh_post + + grad_res_logits_flat = grad_res_logits.reshape(N, HC * HC) + + # --- Grad w.r.t mix + grad_mix = torch.empty_like(mix) + grad_mix[:, :HC] = grad_pre_logits * alpha_pre + grad_mix[:, HC : 2 * HC] = grad_post_logits * alpha_post + grad_mix[:, 2 * HC :] = grad_res_logits_flat * alpha_res + + # --- Grad w.r.t b + grad_b = torch.zeros_like(b, dtype=torch.float32) + if need_pre: + grad_b[:HC] = grad_pre_logits.sum(dim=0) + if need_post: + grad_b[HC : 2 * HC] = grad_post_logits.sum(dim=0) + if need_res: + grad_b[2 * HC :] = grad_res_logits_flat.sum(dim=0) + + # --- Grad w.r.t alphas + if need_pre: + grad_alpha_pre = (grad_pre_logits * mix_pre).sum() + else: + grad_alpha_pre = torch.zeros((), device=mix.device, dtype=torch.float32) + if need_post: + grad_alpha_post = (grad_post_logits * mix_post).sum() + else: + grad_alpha_post = torch.zeros((), device=mix.device, dtype=torch.float32) + if need_res: + grad_alpha_res = (grad_res_logits_flat * mix_res).sum() + else: + grad_alpha_res = torch.zeros((), device=mix.device, dtype=torch.float32) + + # --- Grad w.r.t x and phi via fused mm+norm backward + grad_x_mat, grad_phi = mhc_mm_norm_bwd( + x_mat, + phi, + mix, + invr, + grad_mix, + ) + + # Reshape to original shape + grad_x = grad_x_mat.view(x_shape) + + # Return grads for each forward input + return ( + grad_x, # x + grad_phi, # phi + grad_b, # b + grad_alpha_pre, # alpha_pre + grad_alpha_post, # alpha_post + grad_alpha_res, # alpha_res + None, # allow_fp32 + None, # tmax + None, # rms_eps + None, # pre_eps + None, # sinkhorn_eps + None, # post_mult + ) + + +class LigerMHCPreFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx: Any, x: torch.Tensor, h_pre: torch.Tensor) -> torch.Tensor: + x_shape = x.shape + x_flat, _ = _flatten_tokens(x) + h_pre_flat = h_pre.view(-1, x_flat.shape[1]).to(torch.float32) + out = mhc_pre_fwd(x_flat, h_pre_flat) # [N,C] fp32 + ctx.save_for_backward(x_flat, h_pre_flat) + ctx.x_shape = x_shape + out = out.to(x_flat.dtype) + return out.view(*x_shape[:-2], out.shape[-1]) + + @staticmethod + @ensure_contiguous + def backward(ctx: Any, grad_out: torch.Tensor): + x_flat, h_pre_flat = ctx.saved_tensors + x_shape = ctx.x_shape + N, HC, C = x_flat.shape + go = grad_out.view(-1, C).to(torch.float32) + grad_x, grad_h = mhc_pre_bwd(x_flat, h_pre_flat, go) + grad_x = grad_x.to(x_flat.dtype) + return grad_x.view(*x_shape), grad_h.view(*x_shape[:-1]) + + +class LigerMHCPostResFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward( + ctx: Any, x: torch.Tensor, f_out: torch.Tensor, h_post: torch.Tensor, h_res: torch.Tensor + ) -> torch.Tensor: + x_shape = x.shape + x_flat, _ = _flatten_tokens(x) + N, HC, C = x_flat.shape + f_flat = f_out.view(-1, C) + h_post_flat = h_post.view(-1, 
HC).to(torch.float32) + h_res_flat = h_res.view(-1, HC, HC).to(torch.float32) + out = mhc_post_res_fwd(x_flat, f_flat, h_post_flat, h_res_flat) # [N,HC,C] fp32 + ctx.save_for_backward(x_flat, f_flat, h_post_flat, h_res_flat) + ctx.x_shape = x_shape + out = out.to(x_flat.dtype) + return out.view(*x_shape) + + @staticmethod + @ensure_contiguous + def backward(ctx: Any, grad_out: torch.Tensor): + x_flat, f_flat, h_post_flat, h_res_flat = ctx.saved_tensors + x_shape = ctx.x_shape + N, HC, C = x_flat.shape + go = grad_out.view(-1, HC, C).to(torch.float32) + + grad_x, grad_f, grad_hpost, grad_hres = mhc_post_res_bwd(x_flat, f_flat, h_post_flat, h_res_flat, go) + + outer = x_shape[:-2] + return ( + grad_x.to(x_flat.dtype).view(*x_shape), + grad_f.to(f_flat.dtype).view(*outer, C), + grad_hpost.view(*outer, HC), + grad_hres.view(*outer, HC, HC), + ) diff --git a/src/liger_kernel/ops/multi_token_attention.py b/src/liger_kernel/ops/multi_token_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..a91ebf58a90d5c56b746a384e043cf1031522a85 --- /dev/null +++ b/src/liger_kernel/ops/multi_token_attention.py @@ -0,0 +1,207 @@ +import torch +import torch.nn.functional as F +import triton +import triton.language as tl + +from torch.nn.modules.utils import _pair + +from liger_kernel.ops.softmax import _softmax_forward +from liger_kernel.ops.sparsemax import _sparsemax_backward +from liger_kernel.ops.sparsemax import _sparsemax_forward +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import ensure_contiguous + + +@triton.jit +def _mask_fwd_kernel( + scores_ptr, + out_ptr, + stride_b, + stride_m, + stride_n, + L, + mask_val: tl.constexpr, + BLOCK: tl.constexpr, + num_warps: tl.constexpr, +): + row_block = tl.program_id(0) + col_block = tl.program_id(1) + batch_id = tl.program_id(2) + + row_idx = row_block * BLOCK + tl.arange(0, BLOCK) + col_idx = col_block * BLOCK + tl.arange(0, BLOCK) + in_bounds = (row_idx[:, None] < L) & (col_idx[None, :] < L) + + base = scores_ptr + batch_id * stride_b + offs = row_idx[:, None] * stride_m + col_idx[None, :] * stride_n + future = col_idx[None, :] > row_idx[:, None] + mask_load = in_bounds & ~future + out = tl.load(base + offs, mask=mask_load, other=mask_val, cache_modifier=".ca") + tl.store(out_ptr + batch_id * stride_b + offs, out, mask=in_bounds, cache_modifier=".cs") + + +@triton.jit +def _mask_bwd_kernel( + grad_in_ptr, out_ptr, stride_b, stride_m, stride_n, L, BLOCK: tl.constexpr, num_warps: tl.constexpr +): + row_block = tl.program_id(0) + col_block = tl.program_id(1) + batch_id = tl.program_id(2) + + row_idx = row_block * BLOCK + tl.arange(0, BLOCK) + col_idx = col_block * BLOCK + tl.arange(0, BLOCK) + in_bounds = (row_idx[:, None] < L) & (col_idx[None, :] < L) + + base = grad_in_ptr + batch_id * stride_b + offs = row_idx[:, None] * stride_m + col_idx[None, :] * stride_n + grad_vals = tl.load(base + offs, mask=in_bounds, other=0.0, cache_modifier=".ca") + + future = col_idx[None, :] > row_idx[:, None] + zero = tl.zeros(grad_vals.shape, dtype=grad_vals.dtype) + out = tl.where(future, zero, grad_vals) + + tl.store(out_ptr + batch_id * stride_b + offs, out, mask=in_bounds, cache_modifier=".wb") + + +def _mask_inf_forward(scores: torch.Tensor) -> torch.Tensor: + *batch, L, _ = scores.shape + N = int(torch.prod(torch.tensor(batch))) if batch else 1 + scores_f = scores.view(N, L, L) + out = torch.empty_like(scores_f) + + sb, sm, sn = scores_f.stride(0), scores_f.stride(1), scores_f.stride(2) + BLOCK_SIZE, 
num_warps = calculate_settings(L) + grid = (triton.cdiv(L, BLOCK_SIZE), triton.cdiv(L, BLOCK_SIZE), N) + _mask_fwd_kernel[grid](scores_f, out, sb, sm, sn, L, mask_val=-1e9, BLOCK=BLOCK_SIZE, num_warps=num_warps) + return out.view(*batch, L, L) + + +def _mask_inf_backward(grad: torch.Tensor) -> torch.Tensor: + *batch, L, _ = grad.shape + N = int(torch.prod(torch.tensor(batch))) if batch else 1 + grad_f = grad.view(N, L, L) + out = torch.empty_like(grad_f) + + sb, sm, sn = grad_f.stride(0), grad_f.stride(1), grad_f.stride(2) + BLOCK_SIZE, num_warps = calculate_settings(L) + grid = (triton.cdiv(L, BLOCK_SIZE), triton.cdiv(L, BLOCK_SIZE), N) + _mask_bwd_kernel[grid](grad_f, out, sb, sm, sn, L, BLOCK=BLOCK_SIZE, num_warps=num_warps) + return out.view(*batch, L, L) + + +def _mask_zero_forward(scores: torch.Tensor) -> torch.Tensor: + *batch, L, _ = scores.shape + N = int(torch.prod(torch.tensor(batch))) if batch else 1 + scores_f = scores.view(N, L, L) + out = torch.empty_like(scores_f) + + sb, sm, sn = scores_f.stride(0), scores_f.stride(1), scores_f.stride(2) + BLOCK_SIZE, num_warps = calculate_settings(L) + grid = (triton.cdiv(L, BLOCK_SIZE), triton.cdiv(L, BLOCK_SIZE), N) + _mask_fwd_kernel[grid](scores_f, out, sb, sm, sn, L, mask_val=0.0, BLOCK=BLOCK_SIZE, num_warps=num_warps) + return out.view(*batch, L, L) + + +def _mask_zero_backward(grad: torch.Tensor) -> torch.Tensor: + *batch, L, _ = grad.shape + N = int(torch.prod(torch.tensor(batch))) if batch else 1 + grad_f = grad.view(N, L, L) + out = torch.empty_like(grad_f) + + sb, sm, sn = grad_f.stride(0), grad_f.stride(1), grad_f.stride(2) + BLOCK_SIZE, num_warps = calculate_settings(L) + grid = (triton.cdiv(L, BLOCK_SIZE), triton.cdiv(L, BLOCK_SIZE), N) + _mask_bwd_kernel[grid](grad_f, out, sb, sm, sn, L, BLOCK=BLOCK_SIZE, num_warps=num_warps) + return out.view(*batch, L, L) + + +class LigerMultiTokenAttentionFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, scores, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, sparse=False): + scores_inf = _mask_inf_forward(scores) + + out_flat_sparse = None + activation_output = None + + ctx.sparse = sparse + + if sparse: + if scores_inf.dtype != torch.float32: + raise RuntimeError("Liger sparse multi-token attention currently only supports fp32 input scores") + probs_sparse, out_flat_sparse = _sparsemax_forward(scores_inf, dim=-1) + activation_output = probs_sparse + ctx.save_for_backward(scores_inf, activation_output, out_flat_sparse, weight, bias) + ctx.out_flat_sparse_saved = True + else: + probs_softmax, _, _, _ = _softmax_forward(scores_inf) + activation_output = probs_softmax + ctx.save_for_backward(scores_inf, activation_output, weight, bias) + ctx.out_flat_sparse_saved = False + + out_conv = F.conv2d( + activation_output, + weight, + bias, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + + out = _mask_zero_forward(out_conv) + + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.dim = -1 + + return out + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_out): + if ctx.out_flat_sparse_saved: + scores_inf, activation_output, out_flat_sparse, weight, bias = ctx.saved_tensors + else: + scores_inf, activation_output, weight, bias = ctx.saved_tensors + out_flat_sparse = None + + use_sparsemax = ctx.sparse + dim = ctx.dim + stride, padding, dilation, groups = (ctx.stride, ctx.padding, ctx.dilation, ctx.groups) + + grad_conv = 
_mask_zero_backward(grad_out) + + grad_probs = F.conv_transpose2d( + grad_conv, weight, None, stride=stride, padding=padding, dilation=dilation, groups=groups + ) + + grad_weight = torch.nn.grad.conv2d_weight( + input=activation_output, + weight_size=weight.shape, + grad_output=grad_conv, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + grad_bias = None + if bias is not None: + grad_bias = grad_conv.sum(dim=(0, 2, 3)) + + grad_scores_inf = None + if use_sparsemax: + if not ctx.out_flat_sparse_saved or out_flat_sparse is None: + raise RuntimeError("Internal error: Sparse flag is set but sparse tensor was not saved.") + grad_scores_inf = _sparsemax_backward(grad_probs, out_flat_sparse, dim=dim) + else: + grad_probs_cont = grad_probs + probs_cont = activation_output + dot = (grad_probs_cont * probs_cont).sum(dim=-1, keepdim=True) + grad_scores_inf = probs_cont * (grad_probs_cont - dot) + + grad_scores = _mask_inf_backward(grad_scores_inf) + + return (grad_scores, grad_weight, grad_bias, None, None, None, None, None) diff --git a/src/liger_kernel/ops/poly_norm.py b/src/liger_kernel/ops/poly_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..2198e522d2bc1acc61644bb8882253e4d7650268 --- /dev/null +++ b/src/liger_kernel/ops/poly_norm.py @@ -0,0 +1,384 @@ +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import set_large_grf_mode +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + from triton.language.extra.libdevice import rsqrt + except ModuleNotFoundError: + from triton.language.extra.cuda.libdevice import rsqrt +else: + from triton.language.math import rsqrt + + +@triton.jit +def _poly_norm_forward_kernel( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, # weight: [3] for [w0, w1, w2] + B_ptr, # bias: scalar + RSTD_ptr, # cache rstd for backward: shape (n_rows, 3) + RSTD_row_stride, + n_cols, + eps, + BLOCK_SIZE: tl.constexpr, +): + """ + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + + Reference: + 1. https://github.com/BryceZhuo/PolyCom/ + 2. 
https://arxiv.org/pdf/2411.03884 + + Cache rstd values for backward pass + """ + row_idx = tl.program_id(0).to(tl.int64) + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + # Load pointers + Y_ptr += row_idx * Y_row_stride + X_ptr += row_idx * X_row_stride + RSTD_ptr += row_idx * RSTD_row_stride + + # Load input row + X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0) + + # Load weights and bias + w0 = tl.load(W_ptr + 0) + w1 = tl.load(W_ptr + 1) + w2 = tl.load(W_ptr + 2) + b = tl.load(B_ptr) + + # Compute x³, x², x + X_pow3 = X_row * X_row * X_row + X_pow2 = X_row * X_row + X_pow1 = X_row + + # Compute norm(x³): norm(u) = u * rsqrt(mean(u²) + eps) + mean_square_3 = tl.sum(X_pow3 * X_pow3, axis=0) / n_cols + rstd_3 = rsqrt(mean_square_3 + eps) + norm_x3 = X_pow3 * rstd_3 + + # Compute norm(x²) + mean_square_2 = tl.sum(X_pow2 * X_pow2, axis=0) / n_cols + rstd_2 = rsqrt(mean_square_2 + eps) + norm_x2 = X_pow2 * rstd_2 + + # Compute norm(x) + mean_square_1 = tl.sum(X_pow1 * X_pow1, axis=0) / n_cols + rstd_1 = rsqrt(mean_square_1 + eps) + norm_x1 = X_pow1 * rstd_1 + + # Cache rstd values for backward + tl.store(RSTD_ptr + 0, rstd_3) + tl.store(RSTD_ptr + 1, rstd_2) + tl.store(RSTD_ptr + 2, rstd_1) + + # Compute output: y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + Y_row = w0 * norm_x3 + w1 * norm_x2 + w2 * norm_x1 + b + + # Store output + tl.store(Y_ptr + col_offsets, Y_row, mask=mask) + + +@triton.jit +def _poly_norm_backward_kernel( + dY_ptr, + dY_row_stride, + dX_ptr, + dX_row_stride, + X_ptr, + X_row_stride, + W_ptr, + RSTD_ptr, + RSTD_row_stride, + dW_ptr, # shape: (n_programs, 3) + dW_row_stride, + dB_ptr, # shape: (n_programs,) + n_rows, + n_cols, + rows_per_program: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + PolyNorm Backward Kernel Gradient: + ∂L/∂x_i = Σ_p w_p * [p*x_i^(p-1) * grad_i/D_p - (p/d)*x_i^(2p-1) * S_p/(D_p³)] + + where: + - D_p = RMS(x^p) = 1/rstd_p + - S_p = sum(grad * x^p) over the row + - d = n_cols + - p ∈ {3, 2, 1} + """ + row_block_id = tl.program_id(0).to(tl.int64) + row_start = row_block_id * rows_per_program + row_end = min((row_block_id + 1) * rows_per_program, n_rows) + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + # Initialize accumulators for weight and bias gradients (scalars) + dW0_acc = 0.0 + dW1_acc = 0.0 + dW2_acc = 0.0 + dB_acc = 0.0 + + # Load weights + w0 = tl.load(W_ptr + 0).to(tl.float32) + w1 = tl.load(W_ptr + 1).to(tl.float32) + w2 = tl.load(W_ptr + 2).to(tl.float32) + + for row_idx in range(row_start, row_end): + dy_base = dY_ptr + row_idx * dY_row_stride + x_base = X_ptr + row_idx * X_row_stride + dx_base = dX_ptr + row_idx * dX_row_stride + rstd_base = RSTD_ptr + row_idx * RSTD_row_stride + + dY_row = tl.load(dy_base + col_offsets, mask=mask, other=0.0).to(tl.float32) + X_row = tl.load(x_base + col_offsets, mask=mask, other=0.0).to(tl.float32) + + # Load cached rstd values + rstd_3 = tl.load(rstd_base + 0).to(tl.float32) + rstd_2 = tl.load(rstd_base + 1).to(tl.float32) + rstd_1 = tl.load(rstd_base + 2).to(tl.float32) + + # Compute powers + X_pow3 = X_row * X_row * X_row + X_pow2 = X_row * X_row + X_pow1 = X_row + + # Accumulate bias gradient: dB = sum(dY) + dB_acc += tl.sum(dY_row, axis=0) + + # Compute gradient w.r.t. 
input using closed-form formula + # For p=3: ∂L/∂x from w0 * norm(x³) + S_3 = tl.sum(dY_row * X_pow3, axis=0) # scalar + grad_x_3 = w0 * ( + 3.0 * X_pow2 * rstd_3 * dY_row + - (3.0 / n_cols) * X_row * X_row * X_row * X_row * X_row * (rstd_3 * rstd_3 * rstd_3) * S_3 + ) + + # For p=2: ∂L/∂x from w1 * norm(x²) + S_2 = tl.sum(dY_row * X_pow2, axis=0) # scalar + grad_x_2 = w1 * ( + 2.0 * X_row * rstd_2 * dY_row - (2.0 / n_cols) * X_row * X_row * X_row * (rstd_2 * rstd_2 * rstd_2) * S_2 + ) + + # For p=1: ∂L/∂x from w2 * norm(x) + S_1 = tl.sum(dY_row * X_pow1, axis=0) # scalar + grad_x_1 = w2 * (1.0 * rstd_1 * dY_row - (1.0 / n_cols) * X_row * (rstd_1 * rstd_1 * rstd_1) * S_1) + + # Accumulate weight gradients using closed-form: dW_p = rstd_p * S_p + dW0_acc += rstd_3 * S_3 + dW1_acc += rstd_2 * S_2 + dW2_acc += rstd_1 * S_1 + + # Total gradient + dX_row = grad_x_3 + grad_x_2 + grad_x_1 + + # Store gradient + tl.store(dx_base + col_offsets, dX_row, mask=mask) + + # Store accumulated gradients (scalars) + tl.store(dW_ptr + row_block_id * dW_row_stride + 0, dW0_acc) + tl.store(dW_ptr + row_block_id * dW_row_stride + 1, dW1_acc) + tl.store(dW_ptr + row_block_id * dW_row_stride + 2, dW2_acc) + tl.store(dB_ptr + row_block_id, dB_acc) + + +def poly_norm_forward(X, W, B, eps=1e-6): + """ + PolyNorm Forward Pass + + Args: + X: input tensor of shape (*, H) where H is hidden dimension + W: weight tensor of shape (3,) for [w0, w1, w2] + B: bias scalar tensor + eps: epsilon for numerical stability + + Returns: + Y: output tensor of same shape as X + X: reshaped input (for backward) + RSTD: cached rstd values (for backward) + BLOCK_SIZE: block size used + num_warps: number of warps used + """ + shape = X.shape + dim = shape[-1] + X = X.view(-1, dim) + n_rows, n_cols = X.shape + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + + # RSTD is to cache rstd for each row + Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device) + RSTD = torch.empty((n_rows, 3), dtype=torch.float32, device=X.device) + + # Check constraints + assert W.shape[0] == 3, "Weight tensor must have shape (3,)" + assert B.numel() == 1, "Bias must be a scalar" + + # XPU-specific optimization + kernel_args = {} + if X.device.type == "xpu": + set_large_grf_mode(kernel_args) + + # Launch kernel + _poly_norm_forward_kernel[(n_rows,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + B, + RSTD, + RSTD.stride(0), + n_cols, + eps, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, + ) + + return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps + + +def poly_norm_backward(dY, X, W, RSTD, BLOCK_SIZE, num_warps, in_place): + """ + PolyNorm Backward Pass + + Args: + dY: gradient of output + X: input tensor (already reshaped to 2D) + W: weight tensor + RSTD: cached rstd values from forward + BLOCK_SIZE: block size from forward + num_warps: number of warps from forward + in_place: whether to in-place modify dY to store dX (saves memory) + + Returns: + dX: gradient w.r.t. input + dW: gradient w.r.t. weight + dB: gradient w.r.t. 
bias + """ + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + n_rows, n_cols = dY.shape + + # Get number of SMs for parallelization + import math + + sm_count = 1 + if X.device.type == "cuda": + sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count + elif X.device.type == "xpu": + sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count + elif X.device.type == "npu": + sm_count = get_npu_core_count() + + # Allocate or reuse gradients + if in_place is True: + dX = dY + else: + dX = torch.zeros_like(dY) + + _dW = torch.empty((sm_count, 3), dtype=torch.float32, device=W.device) + _dB = torch.empty((sm_count,), dtype=torch.float32, device=W.device) + + rows_per_program = math.ceil(n_rows / sm_count) + grid = (sm_count,) + + # XPU-specific optimization + kernel_args = {} + if X.device.type == "xpu": + set_large_grf_mode(kernel_args) + + # Launch backward kernel + _poly_norm_backward_kernel[grid]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + W, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0), + _dB, + n_rows, + n_cols, + rows_per_program, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, + ) + + # Reduce gradients across SMs + dX = dX.view(*shape) + dW = _dW.sum(dim=0).to(W.dtype) + dB = _dB.sum().to(W.dtype) + + return dX, dW, dB + + +class LigerPolyNormFunction(torch.autograd.Function): + """ + PolyNorm Function with forward and backward pass + + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + + Backward uses closed-form gradient: + ∂L/∂x_i = Σ_p w_p * [p*x_i^(p-1) * grad_i/D_p - (p/d)*x_i^(2p-1) * S_p/(D_p³)] + """ + + @staticmethod + @ensure_contiguous + def forward(ctx, X, W, B, eps=1e-6, in_place=True): + """ + Args: + X: input tensor of shape (B, T, H) or (BxT, H) + W: weight tensor of shape (3,) for [w0, w1, w2] + B: bias scalar + eps: epsilon for numerical stability + in_place: whether to in-place modify grad_output in backward (saves memory) + + Returns: + Y: output tensor of same shape as X + """ + Y, X, RSTD, BLOCK_SIZE, num_warps = poly_norm_forward(X, W, B, eps) + ctx.BLOCK_SIZE = BLOCK_SIZE + ctx.num_warps = num_warps + ctx.in_place = in_place + ctx.save_for_backward(X, W, RSTD) + return Y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output): + """ + Args: + grad_output: gradient of output + + Returns: + dX, dW, dB: gradients w.r.t. 
X, W, B + """ + X, W, RSTD = ctx.saved_tensors + dX, dW, dB = poly_norm_backward(grad_output, X, W, RSTD, ctx.BLOCK_SIZE, ctx.num_warps, ctx.in_place) + return dX, dW, dB, None, None diff --git a/src/liger_kernel/ops/qwen2vl_mrope.py b/src/liger_kernel/ops/qwen2vl_mrope.py new file mode 100755 index 0000000000000000000000000000000000000000..fbd120f96d4b0a7e81da06f118c0ba375976db31 --- /dev/null +++ b/src/liger_kernel/ops/qwen2vl_mrope.py @@ -0,0 +1,222 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _triton_qwen2vl_mrope( + q_ptr, + k_ptr, + cos, + sin, + sl, + bs: tl.constexpr, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + pad_n_qh: tl.constexpr, + pad_n_kh: tl.constexpr, + pad_hd: tl.constexpr, + mrope_section_t: tl.constexpr, + mrope_section_h: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + BACKWARD_PASS: tl.constexpr = False, +): + pid = tl.program_id(0) + + # locate start address + q_ptr = q_ptr + pid * (n_qh * hd) + k_ptr = k_ptr + pid * (n_kh * hd) + + # #################################################################### + # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position + # m of this program instance + # #################################################################### + + # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which + # effectively represents a 2D grid of size [bsz, seq_len] with seq_len dimension + # being the fastest changing dimension. Thus we can simply do pid // sl to get the batch index + # and pid % sl to get the sequence index. + # 2. We only need the left half of cos and sin matrix because the right half is just + # a clone of the left half. + t_end = mrope_section_t + h_end = t_end + mrope_section_h + + t_cos = cos + pid * hd + h_cos = t_cos + bs * sl * hd + w_cos = h_cos + bs * sl * hd + t_sin = sin + pid * hd + h_sin = t_sin + bs * sl * hd + w_sin = h_sin + bs * sl * hd + + cos_offsets = tl.arange(0, pad_hd // 2) + t_mask = cos_offsets < t_end + h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end) + w_mask = (h_end <= cos_offsets) & (cos_offsets < hd // 2) + t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0) + h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0) + w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0) + t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0) + h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0) + w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0) + cos_row = t_cos_row + h_cos_row + w_cos_row + sin_row = t_sin_row + h_sin_row + w_sin_row + + # #################################################################### + # Load the left and right half of q and k for the current + # program instance (i.e. 
for the current token) separately + # #################################################################### + # left half of the head + first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2) + first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2) + q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype) + k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype) + + # right half of the head + second_half_q_offsets = first_half_q_offsets + (hd // 2) + second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_q_mask = first_q_mask + second_k_mask = first_k_mask + q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype) + k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype) + + if not BACKWARD_PASS: + # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] + new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + else: + # with some math, we can get: + # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin] + new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + + +def qwen2vl_mrope_forward(q, k, cos, sin, mrope_section): + # transpose it back to the physical shape because Triton looks at the physical storage + # note: q and k are incontiguous before the transformation and will become contiguous after transpose + q = q.transpose(1, 2) + k = k.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = q.shape + n_kv_head = k.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head) + + n_row = batch_size * seq_len + + # ensure tensors passed into the kernel are contiguous. 
It will be no-op if they are already contiguous + q = q.contiguous() + k = k.contiguous() + cos = cos.contiguous() + sin = sin.contiguous() + + _triton_qwen2vl_mrope[(n_row,)]( + q, + k, + cos, + sin, + seq_len, + batch_size, + n_q_head, + n_kv_head, + head_dim, + pad_n_q_head, + pad_n_kv_head, + pad_hd, + mrope_section[0], + mrope_section[1], + BLOCK_SIZE=BLOCK_SIZE, + BACKWARD_PASS=False, + ) + return q.transpose(1, 2), k.transpose(1, 2), cos, sin + + +def qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section): + dq = dq.transpose(1, 2) + dk = dk.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = dq.shape + n_kv_head = dk.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head) + + n_row = batch_size * seq_len + + # ensure dq and dk are contiguous + dq = dq.contiguous() + dk = dk.contiguous() + + # backward is similar to forward except swapping few ops + _triton_qwen2vl_mrope[(n_row,)]( + dq, + dk, + cos, + sin, + seq_len, + batch_size, + n_q_head, + n_kv_head, + head_dim, + pad_n_q_head, + pad_n_kv_head, + pad_hd, + mrope_section[0], + mrope_section[1], + BLOCK_SIZE=BLOCK_SIZE, + BACKWARD_PASS=True, + ) + return dq.transpose(1, 2), dk.transpose(1, 2) + + +class LigerQwen2VLMRopeFunction(torch.autograd.Function): + """ + Triton implementation of the Qwen2VL Multimodal Rotary Positional Embedding (M-RoPE) operation. + + Please find the corresponding HuggingFace implementation here: + https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py + """ + + @staticmethod + def forward(ctx, q, k, cos, sin, mrope_section, unsqueeze_dim=1): + """ + q size: (bsz, n_q_head, seq_len, head_dim) + k size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (3, bsz, seq_len, head_dim) + sin size: (3, bsz, seq_len, head_dim) + """ + q, k, cos, sin = qwen2vl_mrope_forward(q, k, cos, sin, mrope_section) + ctx.save_for_backward(cos, sin) + ctx.mrope_section = mrope_section + return q, k + + def backward(ctx, dq, dk): + """ + dq size: (bsz, n_q_head, seq_len, head_dim) + dk size: (bsz, n_kv_head, seq_len, head_dim) + cos size: (3, bsz, seq_len, head_dim) + sin size: (3, bsz, seq_len, head_dim) + """ + cos, sin = ctx.saved_tensors + mrope_section = ctx.mrope_section + dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section) + return dq, dk, None, None, None, None diff --git a/src/liger_kernel/ops/rms_norm.py b/src/liger_kernel/ops/rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..e5cab72ea661351fadb5e4513e47dcfd303289ac --- /dev/null +++ b/src/liger_kernel/ops/rms_norm.py @@ -0,0 +1,654 @@ +""" +This file incorporates code from Unsloth licensed under the Apache License, Version 2.0. +See the original Unsloth repository at https://github.com/unslothai/unsloth. + +The following line +https://github.com/linkedin/Liger-Kernel/blob/7382a8761f9af679482b968f9348013d933947c7/src/liger_kernel/ops/rms_norm.py#L30 +is based on code from Unsloth, located at: +https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22 + +Modifications made by Yanning Chen, 2024. 
+""" + +import math +import operator + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import compare_version +from liger_kernel.ops.utils import ensure_contiguous +from liger_kernel.ops.utils import get_npu_core_count +from liger_kernel.ops.utils import set_large_grf_mode +from liger_kernel.ops.utils import torch_to_triton_dtype +from liger_kernel.utils import is_npu_available + +if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available(): + try: + # typical import path with dispatch available + from triton.language.extra.libdevice import rsqrt + except ModuleNotFoundError: + # for working with NGC containers + from triton.language.extra.cuda.libdevice import rsqrt +else: + from triton.language.math import rsqrt + + +_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1) +_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0) +_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1) + + +@triton.jit +def _rms_norm_forward_kernel( + Y_ptr, + Y_row_stride, + X_ptr, + X_row_stride, + W_ptr, + W_row_stride, + RSTD_ptr, + RSTD_row_stride, + n_cols, + eps, + offset, + casting_mode: tl.constexpr, # constexpr so the `if` blocks can be optimized out + elementwise_affine: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + """ + y_i = (x_i / (RMS)) * (offset + wi), RMS = sqrt(sum(x_i^2) / N) + + Reference: + 1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html + 2. https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22 + 3. https://arxiv.org/pdf/1910.07467 + """ + + row_idx = tl.program_id(0).to(tl.int64) + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + y_base = Y_ptr + row_idx * Y_row_stride + x_base = X_ptr + row_idx * X_row_stride + rstd_base = RSTD_ptr + row_idx * RSTD_row_stride + + X_row = tl.load(x_base + col_offsets, mask=mask, other=0) + X_row_dtype = X_row.dtype + if elementwise_affine: + W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0) + + # On Llama, only rstd is computed on fp32 + if casting_mode == _CASTING_MODE_LLAMA: + X_row = X_row.to(tl.float32) + + # Gemma computes everything on fp32, and then casts back the output to the original dtype + if casting_mode == _CASTING_MODE_GEMMA: + if elementwise_affine: + W_row = W_row.to(tl.float32) + X_row = X_row.to(tl.float32) + + if casting_mode == _CASTING_MODE_NONE: + eps = eps.to(X_row_dtype) + offset = offset.to(X_row_dtype) + + mean_square = tl.sum(X_row * X_row, axis=0) / n_cols + rstd = rsqrt(mean_square + eps) + + # We can save time by caching rms with minimal memory overhead + # because rms is much smaller compared to X_row, as rms is for each row. + # However, on the computation side, it can save 4 operations (*, sum, /, sqrt). 
+    tl.store(rstd_base, rstd)
+
+    X_row = X_row * rstd
+
+    # On Llama, the multiplication with the weight is done on the original dtype
+    if casting_mode == _CASTING_MODE_LLAMA:
+        X_row = X_row.to(X_row_dtype)
+
+    if elementwise_affine:
+        Y_row = X_row * (offset + W_row)
+    else:
+        Y_row = X_row
+
+    if casting_mode == _CASTING_MODE_GEMMA:
+        Y_row = Y_row.to(X_row_dtype)
+
+    tl.store(y_base + col_offsets, Y_row, mask=mask)
+
+
+@triton.jit
+def _rms_norm_backward_kernel(
+    dY_ptr,
+    dY_row_stride,
+    dX_ptr,
+    dX_row_stride,
+    X_ptr,
+    X_row_stride,
+    X_dtype: tl.constexpr,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    dW_ptr,
+    dW_row_stride,
+    n_rows,
+    n_cols,
+    offset,
+    rows_per_program,
+    casting_mode: tl.constexpr,
+    elementwise_affine: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    dx = (1 / RMS) * [dy * (w + offset) - (1 / N) * (1 / RMS^2) * ((dy * (w + offset)) dot x) * x]. "*" means element-wise multiplication, whereas "dot" means dot product
+    dw = sum(dy * (x / RMS)). summation over BxT dimension
+    """
+
+    row_block_id = tl.program_id(0).to(tl.int64)
+    row_start = row_block_id * rows_per_program
+    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < n_cols
+
+    if elementwise_affine:
+        dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+
+    if elementwise_affine:
+        W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
+        W_row = W_row + offset
+
+    for row_idx in range(row_start, row_end):
+        dy_base = dY_ptr + row_idx * dY_row_stride
+        dx_base = dX_ptr + row_idx * dX_row_stride
+
+        x_base = X_ptr + row_idx * X_row_stride
+        rstd_base = RSTD_ptr + row_idx * RSTD_row_stride
+
+        dY_row = tl.load(dy_base + col_offsets, mask=mask, other=0.0)
+        X_row = tl.load(x_base + col_offsets, mask=mask, other=0.0)
+
+        # Get cached rms
+        rstd_row = tl.load(rstd_base)
+
+        X_row = X_row.to(tl.float32)
+
+        # Different backward graphs for different casting modes
+        if casting_mode == _CASTING_MODE_LLAMA:
+            if elementwise_affine:
+                m = (dY_row * W_row).to(tl.float32)
+            else:
+                m = dY_row.to(tl.float32)
+
+        elif casting_mode == _CASTING_MODE_GEMMA:
+            dY_row = dY_row.to(tl.float32)
+            if elementwise_affine:
+                m = dY_row * W_row
+            else:
+                m = dY_row
+        else:
+            if elementwise_affine:
+                m = dY_row * W_row
+            else:
+                m = dY_row
+
+        dX_row = rstd_row * m
+
+        dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)
+
+        if elementwise_affine:
+            # calculate the gradient of W
+            if casting_mode == _CASTING_MODE_LLAMA:
+                dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
+            else:
+                # here X_row is already in fp32 (see previous if block)
+                dW_row += dY_row * (X_row * rstd_row)
+
+        tl.store(dx_base + col_offsets, dX_row.to(X_dtype), mask=mask)
+
+    if elementwise_affine:
+        tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
+
+
+@triton.jit
+def _block_rms_norm_forward_kernel(
+    Y_ptr,
+    Y_row_stride,
+    X_ptr,
+    X_row_stride,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    n_rows,
+    n_cols,
+    eps,
+    offset,
+    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
+    elementwise_affine: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_ROW: tl.constexpr,
+):
+    """
+    y_i = (x_i / RMS) * (offset + w_i), RMS = sqrt(sum(x_i^2) / N)
+
+    Reference:
+    1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+    2. https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22
+    3. https://arxiv.org/pdf/1910.07467
+    """
+
+    row_idx = tl.program_id(0) * BLOCK_ROW + tl.arange(0, BLOCK_ROW)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    row_mask = row_idx < n_rows
+    col_mask = col_offsets < n_cols
+
+    X_row = tl.load(
+        X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :],
+        mask=row_mask[:, None] & col_mask[None, :],
+        other=0,
+    )
+    X_row_dtype = X_row.dtype
+    if elementwise_affine:
+        W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0)
+
+    # On Llama, only rstd is computed on fp32
+    if casting_mode == _CASTING_MODE_LLAMA:
+        X_row = X_row.to(tl.float32)
+
+    # Gemma computes everything on fp32, and then casts back the output to the original dtype
+    if casting_mode == _CASTING_MODE_GEMMA:
+        if elementwise_affine:
+            W_row = W_row.to(tl.float32)
+        X_row = X_row.to(tl.float32)
+
+    if casting_mode == _CASTING_MODE_NONE:
+        eps = eps.to(X_row_dtype)
+        offset = offset.to(X_row_dtype)
+
+    mean_square = tl.sum(X_row * X_row, axis=1) / n_cols
+    rstd = rsqrt(mean_square + eps)
+
+    # We can save time by caching rms with minimal memory overhead
+    # because rms is much smaller compared to X_row, as rms is for each row.
+    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
+    tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd, row_mask)
+
+    X_row = X_row * rstd[:, None]
+
+    # On Llama, the multiplication with the weight is done on the original dtype
+    if casting_mode == _CASTING_MODE_LLAMA:
+        X_row = X_row.to(X_row_dtype)
+
+    if elementwise_affine:
+        Y_row = X_row * (offset + W_row)[None, :]
+    else:
+        Y_row = X_row
+
+    if casting_mode == _CASTING_MODE_GEMMA:
+        Y_row = Y_row.to(X_row_dtype)
+
+    tl.store(
+        Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :],
+        Y_row,
+        mask=row_mask[:, None] & col_mask[None, :],
+    )
+
+
+@triton.jit
+def _block_rms_norm_backward_kernel(
+    dY_ptr,
+    dY_row_stride,
+    dX_ptr,
+    dX_row_stride,
+    X_ptr,
+    X_row_stride,
+    X_dtype: tl.constexpr,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    dW_ptr,
+    dW_row_stride,
+    n_rows,
+    n_cols,
+    offset,
+    casting_mode: tl.constexpr,
+    elementwise_affine: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_ROW: tl.constexpr,
+):
+    """
+    dx = (1 / RMS) * [dy * (w + offset) - (1 / N) * (1 / RMS^2) * ((dy * (w + offset)) dot x) * x]. "*" means element-wise multiplication, whereas "dot" means dot product
+    dw = sum(dy * (x / RMS)). summation over BxT dimension
+    """
+
+    pid = tl.program_id(0).cast(tl.int64)
+    NUM_SMS = tl.num_programs(0)
+
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    col_mask = col_offsets < n_cols
+
+    if elementwise_affine:
+        dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+
+        W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0)
+        W_row = W_row + offset
+
+    for start in range(pid * BLOCK_ROW, n_rows, NUM_SMS * BLOCK_ROW):
+        row_idx = start + tl.arange(0, BLOCK_ROW)
+        row_mask = row_idx < n_rows
+        dY_row = tl.load(
+            dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :],
+            mask=row_mask[:, None] & col_mask[None, :],
+            other=0.0,
+        )
+        X_row = tl.load(
+            X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :],
+            mask=row_mask[:, None] & col_mask[None, :],
+            other=0.0,
+        )
+
+        # Get cached rms
+        rstd_row = tl.load(RSTD_ptr + row_idx * RSTD_row_stride, row_mask)
+
+        X_row = X_row.to(tl.float32)
+
+        # Different backward graphs for different casting modes
+        if casting_mode == _CASTING_MODE_LLAMA:
+            if elementwise_affine:
+                m = (dY_row * W_row[None, :]).to(tl.float32)
+            else:
+                m = dY_row.to(tl.float32)
+
+        elif casting_mode == _CASTING_MODE_GEMMA:
+            dY_row = dY_row.to(tl.float32)
+            if elementwise_affine:
+                m = dY_row * W_row[None, :]
+            else:
+                m = dY_row
+        else:
+            if elementwise_affine:
+                m = dY_row * W_row[None, :]
+            else:
+                m = dY_row
+
+        dX_row = rstd_row[:, None] * m
+
+        dX_row += (rstd_row[:, None]) * (
+            -(1 / n_cols) * (rstd_row * rstd_row * tl.sum(m * X_row, axis=1))[:, None] * X_row
+        )
+
+        if elementwise_affine:
+            if casting_mode == _CASTING_MODE_LLAMA:
+                # TODO(tcc): use tl.sum(..., dtype=tl.float32) once we upgrade to triton>=3.3.0
+                dW_row += tl.sum((dY_row * (X_row * rstd_row[:, None]).to(X_dtype)).to(tl.float32), 0)
+            else:
+                # here X_row is already in fp32 (see previous if block)
+                dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]), 0)
+
+        tl.store(
+            dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :],
+            dX_row,
+            mask=row_mask[:, None] & col_mask[None, :],
+        )
+
+    if elementwise_affine:
+        tl.store(dW_ptr + pid * dW_row_stride + col_offsets, dW_row, mask=col_mask)
+
+
+_str_to_casting_mode = {
+    "llama": _CASTING_MODE_LLAMA.value,
+    "gemma": _CASTING_MODE_GEMMA.value,
+    "none": _CASTING_MODE_NONE.value,
+}
+
+
+def rms_norm_forward(X, W, eps, offset, casting_mode, row_mode):
+    if not isinstance(casting_mode, int):
+        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
+        casting_mode = _str_to_casting_mode[casting_mode]
+    else:
+        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"
+
+    shape = X.shape
+    dim = shape[-1]
+    X = X.view(-1, dim)
+    n_rows, n_cols = X.shape
+    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+
+    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    # RSTD caches rstd for each row
+    # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
+    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
+    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)
+
+    if W is not None:
+        # Check constraints.
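+        # Shape contract at this point: X has been flattened to (n_rows, n_cols),
+        # so W must be a 1-D tensor of length n_cols (the hidden dimension).
+        # A minimal usage sketch (hypothetical values, assuming a CUDA device):
+        #   X = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
+        #   W = torch.ones(4096, device="cuda", dtype=torch.bfloat16)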
+ assert X.shape[1] == W.shape[0], ( + "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]" + ) + elementwise_affine = True + else: + elementwise_affine = False + + # XPU-specific optimization + kernel_args = {} + if X.device.type == "xpu": + set_large_grf_mode(kernel_args) + if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode: + _rms_norm_forward_kernel[(n_rows,)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + W.stride(0) if elementwise_affine else 0, + RSTD, + RSTD.stride(0), + n_cols, + eps, + offset, + casting_mode, + elementwise_affine=elementwise_affine, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, # XPU-specific optimization + ) + else: + BLOCK_ROW = 16 + kernel_args["BLOCK_ROW"] = BLOCK_ROW + _block_rms_norm_forward_kernel[(triton.cdiv(n_rows, BLOCK_ROW),)]( + Y, + Y.stride(0), + X, + X.stride(0), + W, + W.stride(0) if elementwise_affine else 0, + RSTD, + RSTD.stride(0), + n_rows, + n_cols, + eps, + offset, + casting_mode, + elementwise_affine=elementwise_affine, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, # XPU-specific optimization + ) + return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode + + +def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place, row_mode): + shape = dY.shape + dim = shape[-1] + dY = dY.view(-1, dim) + n_rows, n_cols = dY.shape + + sm_count = 1 + if X.device.type == "cuda": + sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count + elif X.device.type == "xpu": + sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count + elif X.device.type == "npu": + sm_count = get_npu_core_count() + + if W is not None: + # fp32 for numerical stability especially. + _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device) + elementwise_affine = True + else: + _dW = None + elementwise_affine = False + + if n_cols > BLOCK_SIZE: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + rows_per_program = math.ceil(n_rows / sm_count) + grid = (sm_count,) + + if in_place is True: + dX = dY + else: + dX = torch.zeros_like(dY) + + # XPU-specific optimization + kernel_args = {} + if X.device.type == "xpu": + set_large_grf_mode(kernel_args) + + if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode: + _rms_norm_backward_kernel[grid]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + torch_to_triton_dtype[X.dtype], + W, + W.stride(0) if elementwise_affine else 0, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0) if elementwise_affine else 0, + n_rows, + n_cols, + offset, + rows_per_program, + casting_mode, + elementwise_affine=elementwise_affine, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, # XPU-specific optimization + ) + else: + BLOCK_ROW = 16 + kernel_args["BLOCK_ROW"] = BLOCK_ROW + _block_rms_norm_backward_kernel[grid]( + dY, + dY.stride(0), + dX, + dX.stride(0), + X, + X.stride(0), + torch_to_triton_dtype[X.dtype], + W, + W.stride(0) if elementwise_affine else 0, + RSTD, + RSTD.stride(0), + _dW, + _dW.stride(0) if elementwise_affine else 0, + n_rows, + n_cols, + offset, + casting_mode, + elementwise_affine=elementwise_affine, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + **kernel_args, # XPU-specific optimization + ) + dX = dX.view(*shape) + + if elementwise_affine: + dW = _dW.sum(dim=0).to(W.dtype) + else: + dW = None + + return dX, dW + + +class LigerRMSNormFunction(torch.autograd.Function): + """ + Performs RMSNorm (Root Mean Square Normalization), 
which normalizes the input tensor `X` using the
+    weight tensor `W`, with an optional offset and casting mode.
+
+    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
+    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
+    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.
+
+    In addition, different models cast their inputs at different places during RMSNorm computation. For
+    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
+    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
+    support the following casting modes (they match HuggingFace Transformers' implementations):
+    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
+    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
+    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.
+
+    The `in_place` option controls whether dY is modified in place to store dX. It defaults to `True` to save memory. However, in certain cases it can produce incorrect gradients.
+    For example, gemma2 uses two RMSNorms sequentially with a residual connection in between. The residual part needs dY, so it cannot be modified in-place.
+    Therefore, when patching RMSNorm in gemma2, we set `in_place` to `False`.
+    """
+
+    @staticmethod
+    @ensure_contiguous
+    def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True, row_mode=None):
+        """
+        X: (B, T, H) or (BxT, H)
+        W: (H,)
+        """
+        if isinstance(X, torch.distributed.tensor.DTensor):
+            # Input tensor is output of a tensor parallel module and
+            # needs to be gathered to a local tensor to compute
+            # RMS norm on each TP worker.
+            # TODO: support CP.
+            X = X.full_tensor()
+
+        Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode, row_mode)
+        ctx.offset = offset
+        ctx.casting_mode = casting_mode
+        ctx.in_place = in_place
+        ctx.row_mode = row_mode
+        ctx.BLOCK_SIZE = BLOCK_SIZE
+        ctx.num_warps = num_warps
+        ctx.elementwise_affine = W is not None
+        if W is not None:
+            ctx.save_for_backward(X, W, RSTD)
+        else:
+            ctx.save_for_backward(X, RSTD)
+        return Y
+
+    @staticmethod
+    @ensure_contiguous
+    def backward(ctx, dY):
+        """
+        dY: (B, T, H) or (BxT, H)
+        """
+        if ctx.elementwise_affine:
+            X, W, RSTD = ctx.saved_tensors
+        else:
+            X, RSTD = ctx.saved_tensors
+            W = None
+
+        if isinstance(dY, torch.distributed.tensor.DTensor):
+            # Gradients are the output of a tensor parallel module and
+            # need to be gathered to a local tensor for computing the RMS norm backward.
+            # TODO: support CP.
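+            # dY.full_tensor() materializes the sharded DTensor as a regular local
+            # torch.Tensor (effectively an all-gather), mirroring the handling of X
+            # in forward. The backward then returns dX and dW, plus one None for each
+            # non-tensor forward argument (eps, offset, casting_mode, in_place, row_mode).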
+ dY = dY.full_tensor() + + dX, dW = rms_norm_backward( + dY, X, W, RSTD, ctx.offset, ctx.casting_mode, ctx.BLOCK_SIZE, ctx.num_warps, ctx.in_place, ctx.row_mode + ) + return dX, dW, None, None, None, None, None diff --git a/src/liger_kernel/ops/rope.py b/src/liger_kernel/ops/rope.py new file mode 100755 index 0000000000000000000000000000000000000000..bd8ded7308eeddb7762752f4adc95e68903a77ec --- /dev/null +++ b/src/liger_kernel/ops/rope.py @@ -0,0 +1,239 @@ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _triton_rope( + q_ptr, + q_row_stride, + k_ptr, + k_row_stride, + cos, + cos_row_stride, + sin, + sin_row_stride, + sl, + bs: tl.constexpr, + cos_bs: tl.constexpr, + n_qh: tl.constexpr, + n_kh: tl.constexpr, + hd: tl.constexpr, + pad_n_qh: tl.constexpr, + pad_n_kh: tl.constexpr, + pad_hd: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + BACKWARD_PASS: tl.constexpr = False, +): + # q size: (bsz, seq_len, num_q_heads, head_dim) + # q stride: (seq_len * num_q_heads * head_dim, num_q_heads * head_dim, head_dim, 1) + # k size: (bsz, seq_len, num_kv_heads, head_dim) + # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1) + + # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim) + # stride: (seq_len * head_dim, head_dim, 1) + pid = tl.program_id(0).to(tl.int64) + + # locate start address + q_ptr = q_ptr + pid * q_row_stride + k_ptr = k_ptr + pid * k_row_stride + + # #################################################################### + # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position + # m of this program instance + # #################################################################### + + # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which + # effectively represents a 2D grid of size [bsz, seq_len] with seq_len dimension + # being the fastest changing dimension. Thus we can simply do pid // sl to get the batch index + # and pid % sl to get the sequence index. + # 2. We only need the left half of cos and sin matrix because the right half is just + # a clone of the left half. + batch_idx = pid // sl + cos_row_idx = pid % sl + cos = cos + tl.where( + cos_bs == 1, + cos_row_idx * cos_row_stride, + batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride, + ) + sin = sin + tl.where( + cos_bs == 1, + cos_row_idx * sin_row_stride, + batch_idx * (sl * sin_row_stride) + cos_row_idx * sin_row_stride, + ) + + cos_offsets = tl.arange(0, pad_hd // 2) + cos_mask = cos_offsets < hd // 2 + cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0) + sin_row = tl.load(sin + cos_offsets, mask=cos_mask, other=0) + + # #################################################################### + # Load the left and right half of q and k for the current + # program instance (i.e. 
for the current token) separately + # #################################################################### + # left half of the head + first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :] + first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2) + first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2) + q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype) + k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype) + + # right half of the head + second_half_q_offsets = first_half_q_offsets + (hd // 2) + second_half_k_offsets = first_half_k_offsets + (hd // 2) + second_q_mask = first_q_mask + second_k_mask = first_k_mask + q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype) + k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype) + + if not BACKWARD_PASS: + # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin] + new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + else: + # with some math, we can get: + # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin] + new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row + tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask) + new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row + tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask) + + new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row + tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask) + new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row + tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask) + + +def rope_forward(q, k, cos, sin): + # transpose it back to the physical shape because Triton looks at the physical storage + # note: q and k are incontiguous before the transformation and will become contiguous after transpose + q = q.transpose(1, 2) + k = k.transpose(1, 2) + + batch_size, seq_len, n_q_head, head_dim = q.shape + n_kv_head = k.shape[2] + pad_hd = triton.next_power_of_2(head_dim) + pad_n_q_head = triton.next_power_of_2(n_q_head) + pad_n_kv_head = triton.next_power_of_2(n_kv_head) + BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head) + + n_row = batch_size * seq_len + + # ensure tensors passed into the kernel are contiguous. 
It will be a no-op if they are already contiguous
+ q = q.contiguous()
+ k = k.contiguous()
+ cos = cos.contiguous()
+ sin = sin.contiguous()
+ cos_batch_size = cos.shape[0]
+
+ _triton_rope[(n_row,)](
+ q,
+ q.stride(1),
+ k,
+ k.stride(1),
+ cos,
+ cos.stride(-2),
+ sin,
+ sin.stride(-2),
+ seq_len,
+ batch_size,
+ cos_batch_size,
+ n_q_head,
+ n_kv_head,
+ head_dim,
+ pad_n_q_head,
+ pad_n_kv_head,
+ pad_hd,
+ BLOCK_SIZE=BLOCK_SIZE,
+ BACKWARD_PASS=False,
+ )
+ return q.transpose(1, 2), k.transpose(1, 2), cos, sin
+
+
+def rope_backward(dq, dk, cos, sin):
+ dq = dq.transpose(1, 2)
+ dk = dk.transpose(1, 2)
+
+ batch_size, seq_len, n_q_head, head_dim = dq.shape
+ cos_batch_size = cos.shape[0]
+ n_kv_head = dk.shape[2]
+ pad_hd = triton.next_power_of_2(head_dim)
+ pad_n_q_head = triton.next_power_of_2(n_q_head)
+ pad_n_kv_head = triton.next_power_of_2(n_kv_head)
+ BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)
+
+ n_row = batch_size * seq_len
+
+ # ensure dq and dk are contiguous
+ dq = dq.contiguous()
+ dk = dk.contiguous()
+
+ # backward is similar to forward except for swapping a few ops
+ _triton_rope[(n_row,)](
+ dq,
+ dq.stride(1),
+ dk,
+ dk.stride(1),
+ cos,
+ cos.stride(-2),
+ sin,
+ sin.stride(-2),
+ seq_len,
+ batch_size,
+ cos_batch_size,
+ n_q_head,
+ n_kv_head,
+ head_dim,
+ pad_n_q_head,
+ pad_n_kv_head,
+ pad_hd,
+ BLOCK_SIZE=BLOCK_SIZE,
+ BACKWARD_PASS=True,
+ )
+ return dq.transpose(1, 2), dk.transpose(1, 2)
+
+
+class LigerRopeFunction(torch.autograd.Function):
+ """
+ Triton implementation of the Rotary Positional Embedding (RoPE) operation. Please note that
+ this implements the HuggingFace Llama & Mistral version, whose rotation matrix is slightly different
+ from the one used in the original RoPE paper.
+
+ Please find the corresponding HuggingFace implementation here:
+ https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llama/modeling_llama.py#L184
+
+ For more details about the rotation matrix used here, please refer to:
+ https://discuss.huggingface.co/t/is-llama-rotary-embedding-implementation-correct/44509/2
+ """
+
+ @staticmethod
+ def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """
+ q size: (bsz, n_q_head, seq_len, head_dim)
+ k size: (bsz, n_kv_head, seq_len, head_dim)
+ cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ """
+ q, k, cos, sin = rope_forward(q, k, cos, sin)
+ ctx.save_for_backward(cos, sin)
+ return q, k
+
+ @staticmethod
+ def backward(ctx, dq, dk):
+ """
+ dq size: (bsz, n_q_head, seq_len, head_dim)
+ dk size: (bsz, n_kv_head, seq_len, head_dim)
+ cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
+ """
+
+ cos, sin = ctx.saved_tensors
+ dq, dk = rope_backward(dq, dk, cos, sin)
+ return dq, dk, None, None, None, None
diff --git a/src/liger_kernel/ops/softmax.py b/src/liger_kernel/ops/softmax.py
new file mode 100755
index 0000000000000000000000000000000000000000..15db6cdda36e442007d8c380ef13c3c72293abb1
--- /dev/null
+++ b/src/liger_kernel/ops/softmax.py
@@ -0,0 +1,201 @@
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from liger_kernel.ops.utils import calculate_settings
+from liger_kernel.ops.utils import ensure_contiguous
+
+
+@triton.jit
+def _softmax_single_block_forward_kernel(
+ Y_ptr,
+ Y_row_stride,
+ X_ptr,
+ X_row_stride,
+ n_cols,
+ BLOCK_SIZE: tl.constexpr,
+):
+ row_id = tl.program_id(0)
+ offs = tl.arange(0, BLOCK_SIZE)
+ mask = offs < n_cols
+
+ x = tl.load(X_ptr + row_id * X_row_stride + offs, mask=mask, other=-float("inf"), cache_modifier=".ca")
+ m = tl.max(x, axis=0)
+ e = tl.exp(x - m)
+ d = tl.sum(e, axis=0)
+ y = e / d
+ tl.store(Y_ptr + row_id * Y_row_stride + offs, y, mask=mask, cache_modifier=".cs")
+
+
+@triton.jit
+def _softmax_multi_block_forward_kernel(
+ Y_ptr,
+ Y_row_stride,
+ X_ptr,
+ X_row_stride,
+ n_cols,
+ BLOCK_SIZE: tl.constexpr,
+):
+ row_id = tl.program_id(0)
+ offs = tl.arange(0, BLOCK_SIZE)
+
+ m = tl.float32(-float("inf"))
+ d = tl.float32(0.0)
+ for start in tl.range(0, n_cols, BLOCK_SIZE):
+ idx = start + offs
+ mask = idx < n_cols
+ xblk = tl.load(X_ptr + row_id * X_row_stride + idx, mask=mask, other=-float("inf"), cache_modifier=".ca")
+ blk_max = tl.max(xblk, axis=0)
+ new_m = tl.maximum(m, blk_max)
+ d = d * tl.exp(m - new_m) + tl.sum(tl.exp(xblk - new_m), axis=0)
+ m = new_m
+
+ for start in tl.range(0, n_cols, BLOCK_SIZE):
+ idx = start + offs
+ mask = idx < n_cols
+ xblk = tl.load(X_ptr + row_id * X_row_stride + idx, mask=mask, other=-float("inf"), cache_modifier=".ca")
+ yblk = tl.exp(xblk - m) / d
+ tl.store(Y_ptr + row_id * Y_row_stride + idx, yblk, mask=mask, cache_modifier=".cs")
+
+
+@triton.jit
+def _softmax_single_block_backward_kernel(
+ dy_ptr,
+ dy_stride,
+ y_ptr,
+ y_stride,
+ dx_ptr,
+ dx_stride,
+ n_cols,
+ BLOCK_SIZE: tl.constexpr,
+):
+ row_id = tl.program_id(0)
+ offs = tl.arange(0, BLOCK_SIZE)
+ mask = offs < n_cols
+
+ dy = tl.load(dy_ptr + row_id * dy_stride + offs, mask=mask, other=0.0)
+ y = tl.load(y_ptr + row_id * y_stride + offs, mask=mask, other=0.0, cache_modifier=".ca")
+ dot = tl.sum(dy * y, axis=0)
+ dx = y * (dy - dot)
+ tl.store(dx_ptr + row_id * dx_stride + offs, dx, mask=mask, cache_modifier=".wb")
+
+
+@triton.jit
+def _softmax_multi_block_backward_kernel(
+ dy_ptr,
+ dy_stride,
+ y_ptr,
+ y_stride,
+ dx_ptr,
+ dx_stride,
+ n_cols,
+ BLOCK_SIZE: tl.constexpr,
+):
+ row_id = tl.program_id(0)
+ offs = tl.arange(0, BLOCK_SIZE)
+ acc = tl.float32(0.0)
+
+ for start in tl.range(0, n_cols, BLOCK_SIZE):
+ idx = start + offs
+ mask = idx < n_cols
+ dy_blk = tl.load(dy_ptr + row_id * dy_stride + idx, mask=mask, other=0.0)
+ y_blk = tl.load(y_ptr + row_id * y_stride + idx, mask=mask, other=0.0, cache_modifier=".ca")
+ acc += tl.sum(dy_blk * y_blk, axis=0)
+
+ for start in tl.range(0, n_cols, BLOCK_SIZE):
+ idx = start + offs
+ mask = idx < n_cols
+ dy_blk = tl.load(dy_ptr + row_id * dy_stride + idx, mask=mask, other=0.0)
+ y_blk = tl.load(y_ptr + row_id * y_stride + idx, mask=mask, other=0.0, cache_modifier=".ca")
+ dx_blk = y_blk * (dy_blk - acc)
+ tl.store(dx_ptr + row_id * dx_stride + idx, dx_blk, mask=mask, cache_modifier=".wb")
+
+
+def _softmax_forward(x: torch.Tensor) -> Tuple[torch.Tensor, int, int, bool]:
+ *batch, n_cols = x.shape
+ x2d = x.contiguous().view(-1, n_cols)
+ n_rows = x2d.shape[0]
+
+ BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+ y2d = torch.empty_like(x2d)
+
+ if n_cols <= BLOCK_SIZE:
+ _softmax_single_block_forward_kernel[(n_rows,)](
+ y2d, y2d.stride(0), x2d, x2d.stride(0), n_cols, BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps
+ )
+ multi_block_launch = False
+ else:
+ _softmax_multi_block_forward_kernel[(n_rows,)](
+ y2d, y2d.stride(0), x2d, x2d.stride(0), n_cols, BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps
+ )
+ multi_block_launch = True
+
+ return y2d.view(*batch, n_cols), BLOCK_SIZE, num_warps, multi_block_launch
+
+
+def _softmax_backward(
+ dy: torch.Tensor,
+ y: torch.Tensor,
+ BLOCK_SIZE: int,
+ num_warps: int,
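+ # True when the forward pass launched the multi-block kernels; the backward below takes the matching path.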
multi_block_launch: bool, +) -> torch.Tensor: + *batch, n_cols = dy.shape + dy2d = dy.contiguous().view(-1, n_cols) + y2d = y.contiguous().view(-1, n_cols) + n_rows = dy2d.shape[0] + dx2d = torch.empty_like(dy2d) + + if not multi_block_launch and n_cols <= BLOCK_SIZE: + _softmax_single_block_backward_kernel[(n_rows,)]( + dy2d, + dy2d.stride(0), + y2d, + y2d.stride(0), + dx2d, + dx2d.stride(0), + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + else: + _softmax_multi_block_backward_kernel[(n_rows,)]( + dy2d, + dy2d.stride(0), + y2d, + y2d.stride(0), + dx2d, + dx2d.stride(0), + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + + return dx2d.view(*batch, n_cols) + + +class LigerSoftmaxFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, input_: torch.Tensor): + y, BLOCK_SIZE, num_warps, multi_block_launch = _softmax_forward(input_) + ctx.save_for_backward(y) + ctx.BLOCK_SIZE = BLOCK_SIZE + ctx.num_warps = num_warps + ctx.multi_block_launch = multi_block_launch + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output): + (y,) = ctx.saved_tensors + dx = _softmax_backward( + grad_output, + y, + ctx.BLOCK_SIZE, + ctx.num_warps, + ctx.multi_block_launch, + ) + return dx diff --git a/src/liger_kernel/ops/sparsemax.py b/src/liger_kernel/ops/sparsemax.py new file mode 100755 index 0000000000000000000000000000000000000000..065785a2aa0de756a11788b5c3b1f9e2464fd0c4 --- /dev/null +++ b/src/liger_kernel/ops/sparsemax.py @@ -0,0 +1,177 @@ +from typing import Tuple + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import ensure_contiguous + + +@triton.jit +def _sparsemax_forward_kernel( + x_ptr, + x_stride_row, + sorted_x_ptr, + sorted_x_stride_row, + o_ptr, + o_stride_row, + n_cols, + BLOCK_SIZE: tl.constexpr, + num_warps: tl.constexpr, +): + pid_row = tl.program_id(0) + ptr_x_data_row = x_ptr + pid_row * x_stride_row + ptr_sorted_x_data_row = sorted_x_ptr + pid_row * sorted_x_stride_row + ptr_output_row = o_ptr + pid_row * o_stride_row + + offs = tl.arange(0, BLOCK_SIZE) + mask = offs < n_cols + + z_sorted_block = tl.load( + ptr_sorted_x_data_row + offs, + mask=mask, + other=-float("inf"), + cache_modifier=".cg", + ).to(tl.float32) + + z_valid = tl.where(mask, z_sorted_block, 0.0) + cssv = tl.cumsum(z_valid, 0) + + r = (offs + 1).to(tl.float32) + t_vec = (cssv - 1.0) / r + + support = (z_sorted_block > t_vec) & mask + + k_int = tl.sum(support.to(tl.int32), 0) + k_clamped_int = tl.maximum(k_int, 1) + k = k_clamped_int.to(tl.float32) + + s = tl.sum(tl.where(support, z_sorted_block, 0.0), 0) + + tau = (s - 1.0) / k + + x_block = tl.load( + ptr_x_data_row + offs, + mask=mask, + other=0.0, + cache_modifier=".cg", + ).to(tl.float32) + + y = tl.maximum(x_block - tau, 0.0) + + tl.store( + ptr_output_row + offs, + y.to(ptr_output_row.dtype.element_ty), + mask=mask, + cache_modifier=".cs", + ) + + +@triton.jit +def _sparsemax_backward_kernel( + o_ptr, go_ptr, gi_ptr, stride, n_cols, BLOCK_SIZE: tl.constexpr, num_warps: tl.constexpr +): + row = tl.program_id(0) + o_row = o_ptr + row * stride + go_row = go_ptr + row * stride + gi_row = gi_ptr + row * stride + + offs = tl.arange(0, BLOCK_SIZE) + + supp_cnt = tl.zeros((), tl.float32) + go_sum = tl.zeros((), tl.float32) + + for i in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + offs_iter = i * BLOCK_SIZE + offs + mask_iter = offs_iter < n_cols + o_val = tl.load(o_row + offs_iter, 
mask=mask_iter, other=0.0, cache_modifier=".ca").to(tl.float32) + go_val = tl.load(go_row + offs_iter, mask=mask_iter, other=0.0).to(tl.float32) + supp = o_val > 0.0 + go_sum += tl.sum(tl.where(supp, go_val, 0.0)) + supp_cnt += tl.sum(supp.to(tl.float32)) + + for i in tl.range(0, tl.cdiv(n_cols, BLOCK_SIZE)): + offs_iter = i * BLOCK_SIZE + offs + mask_iter = offs_iter < n_cols + o_val = tl.load(o_row + offs_iter, mask=mask_iter, other=0.0, cache_modifier=".ca").to(tl.float32) + go_val = tl.load(go_row + offs_iter, mask=mask_iter, other=0.0).to(tl.float32) + supp = o_val > 0.0 + gi_val = tl.where( + supp, + go_val - tl.cast(go_sum / tl.maximum(supp_cnt, 1e-6), gi_row.dtype.element_ty).to(tl.float32), + 0.0, + ) + tl.store(gi_row + offs_iter, gi_val.to(gi_row.dtype.element_ty), mask=mask_iter, cache_modifier=".wb") + + +def _sparsemax_forward(x: torch.Tensor, dim: int) -> Tuple[torch.Tensor, torch.Tensor]: + if dim < 0: + dim += x.dim() + x_sw = x.transpose(dim, -1).contiguous() + n_cols = x_sw.size(-1) + n_rows = x_sw.numel() // n_cols + x_flat = x_sw.view(n_rows, n_cols) + x_sorted_flat = torch.sort(x_flat.float(), dim=-1, descending=True).values + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + out_flat = torch.empty_like(x_flat) + grid = (n_rows,) + _sparsemax_forward_kernel[grid]( + x_flat, + x_flat.stride(0), + x_sorted_flat, + x_sorted_flat.stride(0), + out_flat, + out_flat.stride(0), + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + + y = out_flat.view_as(x_sw).transpose(dim, -1) + return y, out_flat + + +def _sparsemax_backward( + grad_out: torch.Tensor, + out_flat: torch.Tensor, + dim: int, +) -> torch.Tensor: + grad_sw = grad_out.transpose(dim, -1).contiguous() + n_cols = grad_sw.size(-1) + n_rows = grad_sw.numel() // n_cols + go_flat = grad_sw.view(n_rows, n_cols) + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + dx_flat = torch.empty_like(go_flat) + grid = (n_rows,) + _sparsemax_backward_kernel[grid]( + out_flat, + go_flat, + dx_flat, + out_flat.stride(0), + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + + dx = dx_flat.view_as(grad_sw).transpose(dim, -1) + return dx + + +class LigerSparsemaxFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, x: torch.Tensor, dim: int): + y, out_flat = _sparsemax_forward(x, dim) + ctx.save_for_backward(out_flat) + ctx.dim = dim + return y + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_out: torch.Tensor): + (out_flat,) = ctx.saved_tensors + dx = _sparsemax_backward(grad_out, out_flat, ctx.dim) + return dx, None diff --git a/src/liger_kernel/ops/swiglu.py b/src/liger_kernel/ops/swiglu.py new file mode 100755 index 0000000000000000000000000000000000000000..675033683e733b9d15ecafe1dc8c08e70e641c9d --- /dev/null +++ b/src/liger_kernel/ops/swiglu.py @@ -0,0 +1,151 @@ +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import calculate_settings +from liger_kernel.ops.utils import ensure_contiguous + + +@triton.jit +def silu(x): + return x * tl.sigmoid(x) + + +@triton.jit +def _swiglu_forward_kernel(a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr): + program_id = tl.program_id(0).to(tl.int64) + + # locate start index + a_ptr += program_id * stride + b_ptr += program_id * stride + c_ptr += program_id * stride + + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + # sigmoid requires type float32 + a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32) + 
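# note: b stays in its storage dtype; the float32 silu(a) result is cast back to b's dtype before the multiply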
b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0) + c_row = silu(a_row).cast(b_row.dtype) * b_row + tl.store(c_ptr + col_offsets, c_row, mask=mask) + + +@triton.jit +def _swiglu_backward_kernel(dc_ptr, a_ptr, b_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr): + program_id = tl.program_id(0).to(tl.int64) + + # locate start index + dc_ptr += program_id * stride + a_ptr += program_id * stride + b_ptr += program_id * stride + + col_offsets = tl.arange(0, BLOCK_SIZE) + mask = col_offsets < n_cols + + dc_row = tl.load(dc_ptr + col_offsets, mask=mask, other=0) + # sigmoid requires type float32 + a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32) + b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0) + + # recomputation to save memory + sig_a = tl.sigmoid(a_row) + silu_a = a_row * sig_a + db_row = dc_row * silu_a + da_row = dc_row * (silu_a * (1 - sig_a) + sig_a) * b_row + + tl.store(a_ptr + col_offsets, da_row, mask=mask) + tl.store(b_ptr + col_offsets, db_row, mask=mask) + + +def swiglu_forward(a, b): + ori_shape = a.shape + + n_cols = ori_shape[-1] + a = a.view(-1, n_cols) + b = b.view(-1, n_cols) + c = torch.empty_like(a) + n_rows = a.shape[0] + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + + _swiglu_forward_kernel[(n_rows,)]( + a, + b, + c, + c.stride(-2), + n_cols=n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + return a, b, c.view(*ori_shape) + + +def swiglu_backward(a, b, dc): + ori_shape = dc.shape + n_cols = ori_shape[-1] + dc = dc.view(-1, n_cols) + n_rows = dc.shape[0] + + BLOCK_SIZE, num_warps = calculate_settings(n_cols) + + _swiglu_backward_kernel[(n_rows,)]( + dc, + a, + b, + dc.stride(-2), + n_cols=n_cols, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) + return a.view(*ori_shape), b.view(*ori_shape) + + +class LigerSiLUMulFunction(torch.autograd.Function): + @staticmethod + @ensure_contiguous + def forward(ctx, a, b): + if isinstance(a, torch.distributed.tensor.DTensor) or isinstance(b, torch.distributed.tensor.DTensor): + device_mesh, placements = ( + (a.device_mesh, a.placements) + if isinstance(a, torch.distributed.tensor.DTensor) + else (b.device_mesh, b.placements) + ) + + # Assume that full tensors are gathered before and identical across + # the associated process groups. + if not isinstance(a, torch.distributed.tensor.DTensor): + a = torch.distributed.tensor.distribute_tensor(a, device_mesh=device_mesh, placements=placements) + if not isinstance(b, torch.distributed.tensor.DTensor): + b = torch.distributed.tensor.distribute_tensor(b, device_mesh=device_mesh, placements=placements) + a_local, b_local, c_local = swiglu_forward(a.to_local(), b.to_local()) + ctx.save_for_backward(a_local, b_local) + ctx.dtensor_metadata = (device_mesh, placements) + return torch.distributed.tensor.DTensor.from_local(c_local, device_mesh, placements) + else: + a, b, c = swiglu_forward(a, b) + ctx.save_for_backward(a, b) + ctx.dtensor_metadata = None + return c + + @staticmethod + @ensure_contiguous + def backward(ctx, dc): + a, b = ctx.saved_tensors + if ctx.dtensor_metadata is not None: + device_mesh, placements = ctx.dtensor_metadata + + # Assume that full tensors are gathered before and identical across + # the associated process groups. 
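+ # Unwrap to local shards first: swiglu_backward runs Triton kernels on plain torch.Tensors, not DTensors.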
+ dc_local = ( + dc.to_local() + if isinstance(dc, torch.distributed.tensor.DTensor) + else torch.distributed.tensor.distribute_tensor(dc, device_mesh=device_mesh, placements=placements) + ) + a_local, b_local = swiglu_backward(a, b, dc_local) + return ( + torch.distributed.tensor.DTensor.from_local(a_local, device_mesh, placements), + torch.distributed.tensor.DTensor.from_local(b_local, device_mesh, placements), + ) + + a, b = swiglu_backward(a, b, dc) + return a, b diff --git a/src/liger_kernel/ops/tiled_mlp.py b/src/liger_kernel/ops/tiled_mlp.py new file mode 100755 index 0000000000000000000000000000000000000000..2c1943c3aaf7f0e37f51dfef4ab5f8950a328790 --- /dev/null +++ b/src/liger_kernel/ops/tiled_mlp.py @@ -0,0 +1,136 @@ +import math + +from typing import Callable +from typing import List +from typing import Optional + +import torch + +from liger_kernel.ops.utils import ensure_contiguous + + +class LigerTiledMLPFunction(torch.autograd.Function): + """ + Based on DeepSpeed's TiledMLP: + https://github.com/deepspeedai/DeepSpeed/blob/v0.18.2/deepspeed/runtime/sequence_parallel/ulysses_sp.py#L838 + + Perform a tiled MLP computation to massively reduce memory usage needed to compute MLP + when using very long sequence lengths. + + This module re-computes `forward` in the `backward`. So the `forward` occurs twice each iteration. + And if you're using activation checkpointing it then occurs thrice. + + Args: + fn: the function to call on sharded inputs (e.g., mlp.forward) + mlp_module: the MLP nn.Module object + x: the input to MLP.forward (hidden_states) + shards: how many shards to use + compute_params: a list of weights engaged in the compute + + Returns: + the computed hidden_states + """ + + @staticmethod + @ensure_contiguous + def forward( + ctx, + fn: Callable, + mlp_module: torch.nn.Module, + x: torch.Tensor, + shards: int, + compute_params: Optional[List[torch.nn.Parameter]] = None, + ) -> torch.Tensor: + ctx.fn = fn + ctx.mlp_module = mlp_module + ctx.shards = shards + ctx.save_for_backward(x) + + # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts) + x_shards = list(torch.chunk(x, chunks=shards, dim=-2)) + with torch.no_grad(): + output_shards = [fn(mlp_module, x_shard) for x_shard in x_shards] + output_unsharded = torch.cat(output_shards, dim=-2) + + return output_unsharded + + @staticmethod + @ensure_contiguous + def backward(ctx, *grads) -> tuple: + fn = ctx.fn + (x,) = ctx.saved_tensors + mlp_module = ctx.mlp_module + shards = ctx.shards + + x_requires_grad = x.requires_grad + x = x.detach() + # detach() unsets x.requires_grad, so restore it + x.requires_grad_(x_requires_grad) + + # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] (moe experts) + hidden_size = x.shape[-1] + x_shape_orig = x.shape + + # flatten bs+seqlen to avoid having stride issues when narrowing into seqlen w/ bs>1 + x = x.view(-1, hidden_size) + incoming_grad = grads[0].view(-1, hidden_size) + x_grad = torch.zeros_like(x) + + x_shards = list(torch.chunk(x, chunks=shards, dim=0)) + + for i, x_shard in enumerate(x_shards): + x_shard.requires_grad_(x_requires_grad) + + # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step + shard_step = x_shards[i].shape[0] + shard_offset = i * x_shards[0].shape[0] + + x_shard.grad = x_grad.narrow(0, shard_offset, shard_step).view_as(x_shard) + incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step).view_as(x_shard) + + with torch.enable_grad(): + output = fn(mlp_module, 
x_shard) + torch.autograd.backward(output, incoming_grad_shard) + + # unflatten + x_grad = x_grad.view(x_shape_orig) + + return (None, None, x_grad, None, None) + + +def apply_tiled_mlp( + fn: Callable, + mlp_module: torch.nn.Module, + x: torch.Tensor, + num_shards: Optional[int] = None, + compute_params: Optional[List[torch.nn.Parameter]] = None, +) -> torch.Tensor: + """ + Apply tiled MLP computation for memory efficiency. + + Args: + fn: the function to call on sharded inputs (e.g., lambda module, x: module(x)) + mlp_module: the MLP nn.Module object + x: the input tensor with shape [bs, seqlen, hidden_size] or [seqlen, hidden_size] + num_shards: number of shards to use. If None, automatically calculated as ceil(seqlen / hidden_size) + compute_params: list of parameters for DeepSpeed ZeRO optimization + + Returns: + output tensor with the same shape as input + """ + if num_shards is None: + # x.shape could be [bs, seqlen, hidden_size] or [seqlen, hidden_size] + hidden_size = x.shape[-1] + seqlen = x.shape[-2] + num_shards = math.ceil(seqlen / hidden_size) + + # Ensure num_shards is at least 1 + num_shards = max(1, num_shards) + + return LigerTiledMLPFunction.apply( + fn, + mlp_module, + x, + num_shards, + compute_params, + ) diff --git a/src/liger_kernel/ops/tvd.py b/src/liger_kernel/ops/tvd.py new file mode 100755 index 0000000000000000000000000000000000000000..154df000539438c4c6e0dde5810a7e76468dffc6 --- /dev/null +++ b/src/liger_kernel/ops/tvd.py @@ -0,0 +1,218 @@ +from typing import Literal +from typing import Optional + +import torch +import triton +import triton.language as tl + +from liger_kernel.ops.utils import ensure_contiguous + +MAX_FUSED_SIZE = 65536 // 4 + +REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"] + +_REDUCTION_MODE_NONE = tl.constexpr(0) +_REDUCTION_MODE_SUM = tl.constexpr(1) +_REDUCTION_MODE_MEAN = tl.constexpr(2) +_REDUCTION_MODE_BATCHMEAN = tl.constexpr(3) + +_str_to_reduction_mode = { + "none": _REDUCTION_MODE_NONE.value, + "sum": _REDUCTION_MODE_SUM.value, + "mean": _REDUCTION_MODE_MEAN.value, + "batchmean": _REDUCTION_MODE_BATCHMEAN.value, +} + + +def get_num_warps(BLOCK_SIZE): + num_warps = 4 + if BLOCK_SIZE >= 32768: + num_warps = 32 + elif BLOCK_SIZE >= 8192: + num_warps = 16 + elif BLOCK_SIZE >= 2048: + num_warps = 8 + + return num_warps + + +@triton.jit +def _tv_distance_kernel( + p_ptr, + p_stride, + q_ptr, + q_stride, + loss_ptr, + loss_stride, + grads_ptr, + grads_stride, + label_ptr, + ignore_index: tl.constexpr, + n_cols, + scale, # pre-computed reduction scale for gradients (fused into kernel) + BLOCK_SIZE: tl.constexpr, + HAS_LABEL: tl.constexpr, + reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN, +): + pid = tl.program_id(0).to(tl.int64) + p_ptr += pid * p_stride + q_ptr += pid * q_stride + loss_ptr += pid * loss_stride + grads_ptr += pid * grads_stride + label_ptr += pid + + base_offsets = tl.arange(0, BLOCK_SIZE) + + if HAS_LABEL: + label = tl.load(label_ptr) + if label == ignore_index: + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + base_offsets + mask = offsets < n_cols + tl.store(grads_ptr + offsets, 0.0, mask=mask) + if reduction == _REDUCTION_MODE_NONE: + tl.store(loss_ptr + offsets, 0.0, mask=mask) + return + + loss_sum = 0.0 + for i in range(0, n_cols, BLOCK_SIZE): + offsets = i + base_offsets + mask = offsets < n_cols + + p = tl.load(p_ptr + offsets, mask=mask, other=0.0) + q = tl.load(q_ptr + offsets, mask=mask, other=0.0) + + # TVD(P || Q) = 0.5 * |P - Q| + tv_loss = 0.5 * tl.abs(p - q) + + # Fuse reduction 
scaling into gradient computation (eliminates separate Python division)
+ grad_res = tl.where(p > q, 0.5 * scale, -0.5 * scale)
+
+ tl.store(grads_ptr + offsets, grad_res, mask=mask)
+
+ if reduction == _REDUCTION_MODE_NONE:
+ tl.store(loss_ptr + offsets, tv_loss, mask=mask)
+ else:
+ loss_sum += tl.sum(tv_loss, axis=0)
+
+ if reduction != _REDUCTION_MODE_NONE:
+ # Fuse reduction scaling into loss (same scale as gradients; avoids Python division)
+ tl.store(loss_ptr, loss_sum * scale)
+
+
+def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label):
+ BT, V = p.shape
+
+ BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
+ num_warps = get_num_warps(BLOCK_SIZE)
+
+ grid = (BT,)
+
+ reduction = _str_to_reduction_mode[reduction]
+
+ out_size = (BT, V) if reduction == _REDUCTION_MODE_NONE.value else (BT,)
+ output_tensor = torch.zeros(out_size, device=p.device, dtype=torch.float32)
+ grads = torch.empty_like(p)
+
+ n_non_ignore = (shift_labels != ignore_index).sum().item() if has_label else BT
+
+ # Pre-compute gradient scale factor (fused into kernel to avoid separate division)
+ if reduction == _REDUCTION_MODE_BATCHMEAN.value:
+ scale = 1.0 / n_non_ignore
+ elif reduction == _REDUCTION_MODE_MEAN.value:
+ scale = 1.0 / (n_non_ignore * V)
+ else:
+ scale = 1.0
+
+ _tv_distance_kernel[grid](
+ p,
+ p.stride(0),
+ q,
+ q.stride(0),
+ output_tensor,
+ output_tensor.stride(0),
+ grads,
+ grads.stride(0),
+ shift_labels if has_label else torch.empty(1, device=p.device),
+ ignore_index,
+ V,
+ scale,
+ BLOCK_SIZE=BLOCK_SIZE,
+ HAS_LABEL=has_label,
+ num_warps=num_warps,
+ reduction=reduction,
+ )
+
+ # Loss and gradients are already scaled inside the kernel; no separate division needed
+ if reduction in (_REDUCTION_MODE_BATCHMEAN.value, _REDUCTION_MODE_MEAN.value):
+ return output_tensor.sum(), grads
+ elif reduction == _REDUCTION_MODE_SUM.value:
+ return output_tensor.sum(dim=0), grads
+ else:
+ return output_tensor, grads
+
+
+def tvd_backward_triton(grad_output, grads):
+ # If the TVD loss is the last layer, grad_output is 1.0. Skip the mul then.
+ if torch.equal(grad_output, torch.tensor(1.0, device=grad_output.device)):
+ return grads
+
+ return grads * grad_output
+
+
+class LigerTVDLossFunction(torch.autograd.Function):
+ """
+ Class implementing the forward and backward pass for the Total Variation Distance Loss using Triton.
+ """
+
+ @staticmethod
+ @ensure_contiguous
+ def forward(
+ ctx,
+ p: torch.Tensor,
+ q: torch.Tensor,
+ shift_labels: Optional[torch.Tensor] = None,
+ reduction: REDUCTION_LITERAL = "batchmean",
+ ignore_index: int = -100,
+ ) -> torch.Tensor:
+ """A forward pass for the Total Variation Distance Loss.
+
+ Args:
+ ctx: Torch autograd context
+ p (torch.Tensor): A tensor of shape (BT, V) containing the first distribution.
+ q (torch.Tensor): A tensor of shape (BT, V) containing the second distribution.
+ shift_labels (Optional[torch.Tensor]): A tensor of shape (BT,) containing the labels.
+ reduction (REDUCTION_LITERAL, optional): The reduction method to be applied. Defaults to "batchmean".
+ ignore_index (int, optional): The index to ignore during loss calculation. Defaults to -100.
+
+ Returns:
+ torch.Tensor: The computed Total Variation Distance Loss.
+ """
+ has_label = False
+ if shift_labels is not None:
+ assert shift_labels.shape == (p.shape[0],), (
+ f"the shape of shift_labels must be (BT,).
Got: {shift_labels.shape}" + ) + shift_labels = shift_labels.contiguous() + has_label = True + + loss, grads = tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label) + ctx.save_for_backward(grads) + return loss + + @staticmethod + @ensure_contiguous + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + """A backward pass for the Total Variation Distance Loss. + + Args: + ctx: Torch autograd context + grad_output (torch.Tensor): The gradient of the loss with respect to the output. + + Returns: + tuple[torch.Tensor, None, None, None, None]: The gradient of the loss with respect to the inputs. + """ + (grads,) = ctx.saved_tensors + grads = tvd_backward_triton(grad_output, grads) + + return grads, None, None, None, None diff --git a/src/liger_kernel/ops/utils.py b/src/liger_kernel/ops/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..41c916324a796ac167e09f1759762bb2d3d10cf8 --- /dev/null +++ b/src/liger_kernel/ops/utils.py @@ -0,0 +1,152 @@ +""" +This file incorporates code from Unsloth licensed under the Apache License, Version 2.0. +See the original Unsloth repository at https://github.com/unslothai/unsloth. + +The following line +https://github.com/linkedin/Liger-Kernel/blob/7382a8761f9af679482b968f9348013d933947c7/src/liger_kernel/ops/utils.py#L23 +is based on code from Unsloth, located at: +https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43 + +Modifications made by Yanning Chen, 2024. +""" + +import functools +import importlib +import operator + +from typing import Callable + +import torch +import triton +import triton.language as tl + +from packaging.version import Version + +from liger_kernel.utils import infer_device + + +def is_hip() -> bool: + return torch.version.hip is not None + + +def ensure_contiguous(fn): + @functools.wraps(fn) + def wrapper(ctx, *args, **kwargs): + def maybe_to_contiguous(x): + return x.contiguous() if isinstance(x, torch.Tensor) else x + + args = [maybe_to_contiguous(arg) for arg in args] + kwargs = {k: maybe_to_contiguous(v) for k, v in kwargs.items()} + return fn(ctx, *args, **kwargs) + + return wrapper + + +def calculate_settings(n): + # reference: https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/utils.py#L43 + + MAX_FUSED_SIZE = 65536 + BLOCK_SIZE = triton.next_power_of_2(n) + if BLOCK_SIZE > MAX_FUSED_SIZE: + raise RuntimeError( + f"Cannot launch Triton kernel since n = {n} exceeds the recommended Triton blocksize = {MAX_FUSED_SIZE}." 
+ ) + + num_warps = 4 + if BLOCK_SIZE >= 32768: + num_warps = 32 if not is_hip() else 16 + elif BLOCK_SIZE >= 8192: + num_warps = 16 + elif BLOCK_SIZE >= 2048: + num_warps = 8 + return BLOCK_SIZE, num_warps + + +def compare_version(package: str, operator: Callable, target: str): + try: + pkg = importlib.import_module(package) + except ImportError: + return False + pkg_version = Version(pkg.__version__) + return operator(pkg_version, Version(target)) + + +def get_amp_custom_fwd_bwd() -> Callable: + device = infer_device() + if compare_version("torch", operator.ge, "2.4.0"): + return ( + functools.partial(torch.amp.custom_fwd, device_type=device), + functools.partial(torch.amp.custom_bwd, device_type=device), + ) + if hasattr(torch, "npu") and getattr(torch.npu, "amp", None) is not None: + return torch.npu.amp.custom_fwd, torch.npu.amp.custom_bwd + return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd + + +amp_custom_fwd, amp_custom_bwd = get_amp_custom_fwd_bwd() + + +torch_to_triton_dtype = { + torch.float32: tl.float32, + torch.float16: tl.float16, + torch.bfloat16: tl.bfloat16, +} + + +@triton.jit +def element_mul_kernel( + X_ptr, + X_stride, + grad_output_ptr, + n_cols, + BLOCK_SIZE: tl.constexpr, +): + """ + This function multiplies each element of the tensor pointed by X_ptr with the value pointed by grad_output_ptr. + The multiplication is performed in-place on the tensor pointed by X_ptr. + + Parameters: + X_ptr: Pointer to the input tensor. + X_stride (int): The stride of the input tensor. + grad_output_ptr: Pointer to the gradient output value. + n_cols (int): The number of columns in the input tensor. + BLOCK_SIZE (int): The block size for Triton operations. + """ + + # Get the program ID and convert it to int64 to avoid overflow + program_id = tl.program_id(0).to(tl.int64) + + # Locate the start index + X_ptr += program_id * X_stride + + # Load the gradient output value + grad_output = tl.load(grad_output_ptr) + + # Perform the element-wise multiplication + for i in range(0, n_cols, BLOCK_SIZE): + X_offsets = i + tl.arange(0, BLOCK_SIZE) + X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols) + tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols) + + +def get_npu_core_count(default: int = 20) -> int: + """Return NPU vector core count. + Fallback to `default` if Triton runtime or NPU device is unavailable. + """ + try: + utils = triton.runtime.driver.active.utils + props = utils.get_device_properties(0) + return int(props.get("num_vectorcore", default)) + except Exception: + return default + + +def set_large_grf_mode(kernel_args: dict): + """Set large GRF mode for XPU devices.""" + # On XPU triton installed along with pytorch-xpu will be called `pytorch-triton-xpu`, + # triton XPU installed from source will be called `triton`. 
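+ # Per the version check below: Triton >= 3.6.0 on XPU takes the GRF size as a numeric string ("256"), while older XPU backends only accept the legacy "large" keyword.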
+ if compare_version("pytorch-triton-xpu", operator.ge, "3.6.0") or compare_version("triton", operator.ge, "3.6.0"): + kernel_args["grf_mode"] = "256" + else: + # API was changed in https://github.com/intel/intel-xpu-backend-for-triton/pull/5430 + kernel_args["grf_mode"] = "large" diff --git a/src/liger_kernel/transformers/__init__.py b/src/liger_kernel/transformers/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..b90d05c430d085108dc96fa31cf11ee390c4a6e0 --- /dev/null +++ b/src/liger_kernel/transformers/__init__.py @@ -0,0 +1,233 @@ +import importlib + +from typing import TYPE_CHECKING + +# Always-safe imports (independent of 'transformers') +from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss # noqa: F401 +from liger_kernel.transformers.dyt import LigerDyT # noqa: F401 +from liger_kernel.transformers.fused_add_rms_norm import LigerFusedAddRMSNorm # noqa: F401 +from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss # noqa: F401 +from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD # noqa: F401 +from liger_kernel.transformers.geglu import LigerGEGLUMLP # noqa: F401 +from liger_kernel.transformers.jsd import LigerJSD # noqa: F401 +from liger_kernel.transformers.kl_div import LigerKLDIVLoss # noqa: F401 +from liger_kernel.transformers.layer_norm import LigerLayerNorm # noqa: F401 +from liger_kernel.transformers.llama4_rope import liger_llama4_text_rotary_pos_emb # noqa: F401 +from liger_kernel.transformers.llama4_rope import liger_llama4_vision_rotary_pos_emb # noqa: F401 +from liger_kernel.transformers.mhc import LigerMHC # noqa: F401 +from liger_kernel.transformers.multi_token_attention import LigerMultiTokenAttention # noqa: F401 +from liger_kernel.transformers.poly_norm import LigerPolyNorm # noqa: F401 +from liger_kernel.transformers.rms_norm import LigerRMSNorm # noqa: F401 +from liger_kernel.transformers.rope import liger_rotary_pos_emb # noqa: F401 +from liger_kernel.transformers.softmax import LigerSoftmax # noqa: F401 +from liger_kernel.transformers.sparsemax import LigerSparsemax # noqa: F401 +from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP # noqa: F401 +from liger_kernel.transformers.swiglu import LigerExperts # noqa: F401 +from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP # noqa: F401 +from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP # noqa: F401 +from liger_kernel.transformers.swiglu import LigerSwiGLUMLP # noqa: F401 +from liger_kernel.transformers.tiled_mlp import LigerTiledGEGLUMLP # noqa: F401 +from liger_kernel.transformers.tiled_mlp import LigerTiledSwiGLUMLP # noqa: F401 +from liger_kernel.transformers.tvd import LigerTVDLoss # noqa: F401 + +# Static-only imports for IDEs and type checkers +if TYPE_CHECKING: + from liger_kernel.transformers.auto_model import AutoLigerKernelForCausalLM # noqa: F401 + from liger_kernel.transformers.monkey_patch import _apply_liger_kernel # noqa: F401 + from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_exaone4 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_falcon_h1 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma2 # noqa: F401 + from liger_kernel.transformers.monkey_patch 
import apply_liger_kernel_to_gemma3 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gemma3_text # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_gpt_oss # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_dense # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_moe # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llava # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mistral # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mixtral # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_mllama # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo2 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo3 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_paligemma # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_phi3 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_pixtral # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2_5_vl # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2_vl # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_5 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_5_moe # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_moe # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_next # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_vl # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_vl_moe # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smollm3 # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smolvlm # noqa: F401 + + +# Check if 'transformers' is installed +try: + import transformers # noqa: F401 + + _TRANSFORMERS_AVAILABLE = True +except ImportError: + _TRANSFORMERS_AVAILABLE = False + + +def is_transformers_available() -> bool: + """ + Returns True if the 'transformers' package is available. + Useful for conditional logic in downstream code. 
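+
+ Example:
+ >>> from liger_kernel.transformers import is_transformers_available
+ >>> if is_transformers_available():
+ ... from liger_kernel.transformers import apply_liger_kernel_to_llama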
+ """ + return _TRANSFORMERS_AVAILABLE + + +def __getattr__(name: str): + """ + Handles lazy access to transformer-dependent attributes. + If 'transformers' is not installed, raises a user-friendly ImportError. + """ + if not _TRANSFORMERS_AVAILABLE: + raise ImportError( + f"The attribute '{name}' requires the 'transformers' library, which is not installed.\n" + f"Please install it with `pip install transformers` to use this functionality." + ) + + if name == "AutoLigerKernelForCausalLM": + module = importlib.import_module("liger_kernel.transformers.auto_model") + return getattr(module, name) + + monkey_patch_symbols = { + "_apply_liger_kernel", + "_apply_liger_kernel_to_instance", + "apply_liger_kernel_to_falcon_h1", + "apply_liger_kernel_to_gemma", + "apply_liger_kernel_to_gemma2", + "apply_liger_kernel_to_gemma3", + "apply_liger_kernel_to_gemma3_text", + "apply_liger_kernel_to_glm4", + "apply_liger_kernel_to_glm4v", + "apply_liger_kernel_to_glm4v_moe", + "apply_liger_kernel_to_gpt_oss", + "apply_liger_kernel_to_granite", + "apply_liger_kernel_to_internvl", + "apply_liger_kernel_to_llama", + "apply_liger_kernel_to_llava", + "apply_liger_kernel_to_llama4", + "apply_liger_kernel_to_mistral", + "apply_liger_kernel_to_mixtral", + "apply_liger_kernel_to_mllama", + "apply_liger_kernel_to_olmo2", + "apply_liger_kernel_to_olmo3", + "apply_liger_kernel_to_paligemma", + "apply_liger_kernel_to_phi3", + "apply_liger_kernel_to_pixtral", + "apply_liger_kernel_to_qwen2", + "apply_liger_kernel_to_qwen2_5_vl", + "apply_liger_kernel_to_qwen2_vl", + "apply_liger_kernel_to_qwen3", + "apply_liger_kernel_to_qwen3_moe", + "apply_liger_kernel_to_qwen3_5", + "apply_liger_kernel_to_qwen3_5_moe", + "apply_liger_kernel_to_qwen3_next", + "apply_liger_kernel_to_qwen3_vl", + "apply_liger_kernel_to_qwen3_vl_moe", + "apply_liger_kernel_to_smollm3", + "apply_liger_kernel_to_smolvlm", + "apply_liger_kernel_to_hunyuan_v1_dense", + "apply_liger_kernel_to_hunyuan_v1_moe", + "apply_liger_kernel_to_exaone4", + } + + if name in monkey_patch_symbols: + module = importlib.import_module("liger_kernel.transformers.monkey_patch") + return getattr(module, name) + + raise AttributeError(f"module {__name__} has no attribute {name}") + + +# Shared symbols in all environments +__all__ = [ + "is_transformers_available", + "LigerCrossEntropyLoss", + "LigerDyT", + "LigerFusedLinearCrossEntropyLoss", + "LigerFusedLinearJSD", + "LigerGEGLUMLP", + "LigerJSD", + "LigerLayerNorm", + "LigerFusedAddRMSNorm", + "LigerPolyNorm", + "LigerRMSNorm", + "liger_rotary_pos_emb", + "liger_llama4_text_rotary_pos_emb", + "liger_llama4_vision_rotary_pos_emb", + "LigerBlockSparseTop2MLP", + "LigerPhi3SwiGLUMLP", + "LigerQwen3MoeSwiGLUMLP", + "LigerSwiGLUMLP", + "LigerTiledGEGLUMLP", + "LigerTiledSwiGLUMLP", + "LigerTVDLoss", + "LigerKLDIVLoss", + "LigerMHC", + "LigerMultiTokenAttention", + "LigerSoftmax", + "LigerSparsemax", +] + +# Add transformer-dependent symbols only if available +if _TRANSFORMERS_AVAILABLE: + __all__.extend( + [ + "AutoLigerKernelForCausalLM", + "_apply_liger_kernel", + "_apply_liger_kernel_to_instance", + "apply_liger_kernel_to_falcon_h1", + "apply_liger_kernel_to_gemma", + "apply_liger_kernel_to_gemma2", + "apply_liger_kernel_to_gemma3", + "apply_liger_kernel_to_gemma3_text", + "apply_liger_kernel_to_glm4", + "apply_liger_kernel_to_glm4v", + "apply_liger_kernel_to_glm4v_moe", + "apply_liger_kernel_to_gpt_oss", + "apply_liger_kernel_to_granite", + "apply_liger_kernel_to_internvl", + "apply_liger_kernel_to_llama", + 
"apply_liger_kernel_to_llava", + "apply_liger_kernel_to_llama4", + "apply_liger_kernel_to_mistral", + "apply_liger_kernel_to_mixtral", + "apply_liger_kernel_to_mllama", + "apply_liger_kernel_to_olmo2", + "apply_liger_kernel_to_olmo3", + "apply_liger_kernel_to_paligemma", + "apply_liger_kernel_to_phi3", + "apply_liger_kernel_to_pixtral", + "apply_liger_kernel_to_qwen2", + "apply_liger_kernel_to_qwen2_5_vl", + "apply_liger_kernel_to_qwen2_vl", + "apply_liger_kernel_to_qwen3", + "apply_liger_kernel_to_qwen3_moe", + "apply_liger_kernel_to_qwen3_5", + "apply_liger_kernel_to_qwen3_5_moe", + "apply_liger_kernel_to_qwen3_next", + "apply_liger_kernel_to_qwen3_vl", + "apply_liger_kernel_to_qwen3_vl_moe", + "apply_liger_kernel_to_smollm3", + "apply_liger_kernel_to_smolvlm", + "apply_liger_kernel_to_hunyuan_v1_dense", + "apply_liger_kernel_to_hunyuan_v1_moe", + "apply_liger_kernel_to_exaone4", + ] + ) diff --git a/src/liger_kernel/transformers/auto_model.py b/src/liger_kernel/transformers/auto_model.py new file mode 100755 index 0000000000000000000000000000000000000000..004a9808ab631b90f11a8f41c2a3111eaceac66f --- /dev/null +++ b/src/liger_kernel/transformers/auto_model.py @@ -0,0 +1,59 @@ +import inspect +import logging + +from transformers import AutoConfig +from transformers import AutoModelForCausalLM + +from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN +from liger_kernel.transformers.monkey_patch import _apply_liger_kernel + +logger = logging.getLogger(__name__) + + +def _get_model_config(model_dir, **model_init_kwargs): + config = AutoConfig.from_pretrained(model_dir, **model_init_kwargs) + return config + + +class AutoLigerKernelForCausalLM(AutoModelForCausalLM): + """ + This class is a drop-in replacement for AutoModelForCausalLM that applies the Liger Kernel to the model + if applicable. + """ + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + model_config = _get_model_config(pretrained_model_name_or_path, **kwargs) + + # Determine the model type and apply the Liger Kernel if applicable + # Note: _apply_liger_kernel will only pass relevant kwargs to the apply_liger_kernel_to_* function + model_type = model_config.model_type + + _apply_liger_kernel(model_type, **kwargs) + + # Filter out kwargs that were passed to the apply_liger_* function, which will cause + # model initialization errors otherwise + apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type] + apply_fn_signature = inspect.signature(apply_fn) + + applicable_kwargs = {key: value for key, value in kwargs.items() if key not in apply_fn_signature.parameters} + + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **applicable_kwargs) + + @classmethod + def from_config(cls, config, **kwargs): + model_type = getattr(config, "model_type", None) + if not model_type: + logger.info("Model type could not be determined from model config. 
No Liger kernels will be applied.")
+ return super().from_config(config, **kwargs)
+
+ _apply_liger_kernel(model_type, **kwargs)
+
+ # Filter out kwargs that were passed to the apply_liger_* function, which will cause
+ # model initialization errors otherwise
+ apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type]
+ apply_fn_signature = inspect.signature(apply_fn)
+ applicable_kwargs = {key: value for key, value in kwargs.items() if key not in apply_fn_signature.parameters}
+
+ return super().from_config(config, **applicable_kwargs)
diff --git a/src/liger_kernel/transformers/cross_entropy.py b/src/liger_kernel/transformers/cross_entropy.py
new file mode 100755
index 0000000000000000000000000000000000000000..ed4310714994e3d75d061a7f87a4c1831df7f95b
--- /dev/null
+++ b/src/liger_kernel/transformers/cross_entropy.py
@@ -0,0 +1,61 @@
+from typing import Optional
+
+import torch
+
+from liger_kernel.ops import LigerCrossEntropyFunction
+from liger_kernel.transformers.functional import CrossEntropyOutput
+
+
+class LigerCrossEntropyLoss(torch.nn.Module):
+ def __init__(
+ self,
+ weight: Optional[torch.FloatTensor] = None,
+ ignore_index: int = -100,
+ lse_square_scale: float = 0.0,
+ label_smoothing: float = 0.0,
+ reduction: str = "mean",
+ softcap: Optional[float] = None,
+ return_z_loss: bool = False,
+ return_token_accuracy: bool = False,
+ return_predicted_tokens: bool = False,
+ ):
+ super().__init__()
+ assert (label_smoothing >= 0) and (label_smoothing <= 1), (
+ f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+ )
+ assert reduction in {
+ "mean",
+ "sum",
+ "none",
+ }, f"reduction must be one of 'mean', 'sum', or 'none'. Got: {reduction}"
+ assert softcap is None or softcap > 0, f"softcap must be greater than 0.0 or None. Got: {softcap}"
+ self.weight = weight
+ self.ignore_index = ignore_index
+ self.lse_square_scale = lse_square_scale
+ self.label_smoothing = label_smoothing
+ self.reduction = reduction
+ self.softcap = softcap
+ self.return_z_loss = return_z_loss
+ self.return_token_accuracy = return_token_accuracy
+ self.return_predicted_tokens = return_predicted_tokens
+
+ def forward(self, _input: torch.Tensor, target: torch.Tensor):
+ loss, z_loss, token_accuracy, predicted_tokens = LigerCrossEntropyFunction.apply(
+ _input,
+ target,
+ self.weight,
+ self.ignore_index,
+ self.lse_square_scale,
+ self.label_smoothing,
+ self.reduction,
+ self.softcap,
+ self.return_z_loss,
+ self.return_token_accuracy,
+ self.return_predicted_tokens,
+ )
+ if not self.return_z_loss and not self.return_token_accuracy and not self.return_predicted_tokens:
+ return loss
+
+ return CrossEntropyOutput(
+ loss=loss, z_loss=z_loss, token_accuracy=token_accuracy, predicted_tokens=predicted_tokens
+ )
diff --git a/src/liger_kernel/transformers/dyt.py b/src/liger_kernel/transformers/dyt.py
new file mode 100755
index 0000000000000000000000000000000000000000..8dd0796fc2bc1f6bc7b1e692c7617f0d4e19ea92
--- /dev/null
+++ b/src/liger_kernel/transformers/dyt.py
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+from liger_kernel.ops import LigerDyTFunction
+
+
+class LigerDyT(nn.Module):
+ def __init__(self, hidden_size, beta=True, init_alpha=0.5):
+ super().__init__()
+ self.hidden_size = hidden_size
+ self.init_alpha = init_alpha
+ self.alpha = nn.Parameter(torch.ones(1) * init_alpha)
+ self.gamma = nn.Parameter(torch.ones(hidden_size))
+ self.beta = None
+ if beta:
+ self.beta = nn.Parameter(torch.zeros(hidden_size))
+
+ def forward(self, x):
+ return LigerDyTFunction.apply(x, self.alpha,
self.gamma, self.beta) + + def extra_repr(self): + return f"{self.hidden_size}, init_alpha={self.init_alpha}, beta={self.beta}" diff --git a/src/liger_kernel/transformers/experimental/__init__.py b/src/liger_kernel/transformers/experimental/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..a662f76e5e439a460a2b2c249301a1313ec7348a --- /dev/null +++ b/src/liger_kernel/transformers/experimental/__init__.py @@ -0,0 +1,5 @@ +from liger_kernel.transformers.experimental.embedding import LigerEmbedding # noqa: F401 + +__all__ = [ + "LigerEmbedding", +] diff --git a/src/liger_kernel/transformers/experimental/embedding.py b/src/liger_kernel/transformers/experimental/embedding.py new file mode 100755 index 0000000000000000000000000000000000000000..7c230b885eb2a6f8cce69181040598db2cf2f8f3 --- /dev/null +++ b/src/liger_kernel/transformers/experimental/embedding.py @@ -0,0 +1,26 @@ +from typing import Optional + +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerEmbeddingFunction + + +class LigerEmbedding(nn.Module): + def __init__(self, num_embeddings, embedding_dim, padding_idx: Optional[int] = None): + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.weight = nn.Parameter(torch.randn(num_embeddings, embedding_dim)) + + if padding_idx is not None: + with torch.no_grad(): + self.weight[padding_idx].fill_(0) + + def forward(self, indices): + embedded = LigerEmbeddingFunction.apply(self.weight, indices) + if self.padding_idx is not None: + embedded = embedded.clone() + embedded[indices == self.padding_idx] = 0 + return embedded diff --git a/src/liger_kernel/transformers/fsdp.py b/src/liger_kernel/transformers/fsdp.py new file mode 100755 index 0000000000000000000000000000000000000000..d32bdd2603b4cbebd2b5bb913978f460d00cb179 --- /dev/null +++ b/src/liger_kernel/transformers/fsdp.py @@ -0,0 +1,55 @@ +from typing import Any +from typing import Callable + +from torch.distributed.fsdp import FullyShardedDataParallel + + +class _FSDPForwardRedirection: + """ + Modified based on + https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648 + Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and + post-forward can be properly executed around the method call. + This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only + the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving + GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`) + will not work because the first `nn.Embedding` layer is not independently wrapped as a FSDP module (because of + the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather + its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just + the `lm_head` part of a model, we need this trick too to properly get its params all-gathered. + """ + + def __call__( + self, + wrapper_module: FullyShardedDataParallel, + method: Callable, + *args: Any, + **kwargs: Any, + ): + """Reroutes a method call through the `wrapper_module`'s `forward` method. + Args: + wrapper_module: The module that has `original_module` wrapped. 
method: The method to call on the wrapped `original_module` (retrieved internally via
+ `wrapper_module._fsdp_wrapped_module`) after inputs get redirected through the
+ `wrapper_module`'s `forward` method.
+ *args: The positional arguments to `method`. They will get passed to a patched
+ `forward` method instead.
+ **kwargs: The keyword arguments to `method`. They will get passed to a patched
+ `forward` method instead.
+ """
+ assert isinstance(wrapper_module, FullyShardedDataParallel)
+ original_module = wrapper_module._fsdp_wrapped_module
+ original_forward = original_module.forward
+
+ def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
+ # Unpatch ourselves immediately before calling `method`,
+ # because `method` may itself want to call the real `forward`
+ original_module.forward = original_forward # type: ignore[method-assign]
+ # Call the actual method, e.g. `.training_step(...)`
+ out = method(*_args, **_kwargs)
+ return out
+
+ # Patch the original_module's forward so we can redirect the arguments back to the real method
+ original_module.forward = wrapped_forward # type: ignore[method-assign]
+ wrapper_output = wrapper_module(*args, **kwargs)
+ return wrapper_output
diff --git a/src/liger_kernel/transformers/functional.py b/src/liger_kernel/transformers/functional.py
new file mode 100755
index 0000000000000000000000000000000000000000..9fc083ba4e0fbaba1dfc06b990c540af6669a81e
--- /dev/null
+++ b/src/liger_kernel/transformers/functional.py
@@ -0,0 +1,410 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from liger_kernel.ops import LigerCrossEntropyFunction
+from liger_kernel.ops import LigerDyTFunction
+from liger_kernel.ops import LigerFusedAddRMSNormFunction
+from liger_kernel.ops import LigerFusedLinearCrossEntropyFunction
+from liger_kernel.ops import LigerFusedLinearJSDFunction
+from liger_kernel.ops import LigerFusedNeighborhoodAttentionFunction
+from liger_kernel.ops import LigerGELUMulFunction
+from liger_kernel.ops import LigerGroupNormFunction
+from liger_kernel.ops import LigerJSDFunction
+from liger_kernel.ops import LigerKLDivLossFunction
+from liger_kernel.ops import LigerLayerNormFunction
+from liger_kernel.ops import LigerMHCCoeffsFunction
+from liger_kernel.ops import LigerMHCPostResFunction
+from liger_kernel.ops import LigerMHCPreFunction
+from liger_kernel.ops import LigerMultiTokenAttentionFunction
+from liger_kernel.ops import LigerPolyNormFunction
+from liger_kernel.ops import LigerQwen2VLMRopeFunction
+from liger_kernel.ops import LigerRMSNormFunction
+from liger_kernel.ops import LigerRopeFunction
+from liger_kernel.ops import LigerSiLUMulFunction
+from liger_kernel.ops import LigerSoftmaxFunction
+from liger_kernel.ops import LigerSparsemaxFunction
+from liger_kernel.ops import LigerTVDLossFunction
+
+
+@dataclass
+class CrossEntropyOutput:
+ loss: torch.Tensor
+ z_loss: Optional[torch.Tensor] = None
+ token_accuracy: Optional[torch.Tensor] = None
+ predicted_tokens: Optional[torch.Tensor] = None
+
+
+# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
+# `size_average` and `reduce` are deprecated placeholders; `weight` is forwarded to the kernel
+def liger_cross_entropy(
+ input,
+ target,
+ weight=None,
+ size_average=None,
+ ignore_index: int = -100,
+ reduce=None,
+ reduction: str = "mean",
+ label_smoothing: float = 0.0,
+ lse_square_scale: float = 0.0,
+ softcap: Optional[float] = None,
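+ # Setting any of the return_* flags below switches the return type from a bare loss tensor to CrossEntropyOutput.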
return_z_loss: bool = False, + return_token_accuracy: bool = False, + return_predicted_tokens: bool = False, +): + loss, z_loss, token_accuracy, predicted_tokens = LigerCrossEntropyFunction.apply( + input, + target, + weight, + ignore_index, + lse_square_scale, + label_smoothing, + reduction, + softcap, + return_z_loss, + return_token_accuracy, + return_predicted_tokens, + ) + + if not return_z_loss and not return_token_accuracy and not return_predicted_tokens: + return loss + + return CrossEntropyOutput( + loss=loss, z_loss=z_loss, token_accuracy=token_accuracy, predicted_tokens=predicted_tokens + ) + + +def liger_fused_linear_cross_entropy( + input, + weight, + target, + bias=None, + ce_weight=None, + ignore_index: int = -100, + lse_square_scale: float = 0.0, + label_smoothing: float = 0.0, + reduction: str = "mean", + softcap: Optional[float] = None, + return_z_loss: bool = False, + accum_dtype=None, + use_token_scaling: bool = False, + return_token_accuracy: bool = False, + return_predicted_tokens: bool = False, +): + loss, z_loss, token_accuracy, predicted_tokens = LigerFusedLinearCrossEntropyFunction.apply( + input, + weight, + target, + bias, + ce_weight, + ignore_index, + lse_square_scale, + label_smoothing, + reduction, + softcap, + return_z_loss, + accum_dtype, + use_token_scaling, + return_token_accuracy, + return_predicted_tokens, + ) + + if not return_z_loss and not return_token_accuracy and not return_predicted_tokens: + return loss + + return CrossEntropyOutput( + loss=loss, z_loss=z_loss, token_accuracy=token_accuracy, predicted_tokens=predicted_tokens + ) + + +def liger_fused_linear_jsd( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels=None, + jsd_beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, +): + return LigerFusedLinearJSDFunction.apply( + student_input, + student_weight, + teacher_input, + teacher_weight, + shift_labels, + jsd_beta, + ignore_index, + temperature, + ) + + +def liger_geglu(a, b): + return LigerGELUMulFunction.apply(a, b) + + +def liger_group_norm( + X, + affine_scaling_weight, + affine_shifting_bias, + num_channels, + num_groups, + eps, +): + return LigerGroupNormFunction.apply( + X, + affine_scaling_weight, + affine_shifting_bias, + num_channels, + num_groups, + eps, + ) + + +def liger_jsd( + input, + target, + shift_labels=None, + beta: float = 0.5, + ignore_index: int = -100, +): + return LigerJSDFunction.apply( + input, + target, + shift_labels, + beta, + ignore_index, + ) + + +# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.kl_div.html#torch.nn.functional.kl_div +# `size_average` and `mean` are being deprecated in torch API and are placeholders here +def liger_kl_div( + input, + target, + size_average: bool = True, + reduce: bool = True, + reduction: str = "mean", + log_target: bool = False, + eps: float = 1e-10, +): + # Note: the default reduction in torch is `mean`, but being `batchmean` in Liger + return LigerKLDivLossFunction.apply( + input, + target, + reduction, + log_target, + eps, + ) + + +def liger_sparsemax( + input, + dim: int = -1, +): + return LigerSparsemaxFunction.apply(input, dim) + + +def liger_multi_token_attention( + scores, + weight, + bias=None, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + sparse: bool = False, +): + """ + Functional interface for multi-token attention. 
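+ The score maps are treated as a (B, C_in, L, L) input to a (grouped, optionally dilated) 2D convolution,
+ so each output attention score can mix information from neighboring query/key positions.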
+
+    Args:
+        scores: Input tensor of shape (B, C_in, L, L)
+        weight: Convolution weight tensor of shape (C_out, C_in // groups, K, K)
+        bias: Optional bias tensor of shape (C_out,)
+        stride: Stride for the convolution (default: 1)
+        padding: Padding for the convolution (default: 0)
+        dilation: Dilation factor for the convolution (default: 1)
+        groups: Number of groups for the convolution (default: 1)
+        sparse: Specifies if input tensors are expected to be sparse (default: False)
+    Returns:
+        Output tensor after applying multi-token attention.
+    """
+    return LigerMultiTokenAttentionFunction.apply(scores, weight, bias, stride, padding, dilation, groups, sparse)
+
+
+def liger_fused_neighborhood_attention(
+    query,
+    key,
+    value,
+    kernel_size: int = 7,
+    dilation: int = 1,
+    scale: Optional[float] = None,
+):
+    """
+    Liger fused neighborhood attention.
+
+    paper: https://arxiv.org/pdf/2504.16922
+
+    Args:
+        query: Query tensor of shape [batch_size, num_heads, seq_len, head_dim]
+        key: Key tensor of shape [batch_size, num_heads, seq_len, head_dim]
+        value: Value tensor of shape [batch_size, num_heads, seq_len, head_dim]
+        kernel_size: Size of the neighborhood window (default: 7)
+        dilation: Dilation factor for the neighborhood (default: 1)
+        scale: Scaling factor for attention scores (default: rsqrt(head_dim))
+
+    Returns:
+        Output tensor of shape [batch_size, num_heads, seq_len, head_dim]
+    """
+    return LigerFusedNeighborhoodAttentionFunction.apply(query, key, value, kernel_size, dilation, scale)
+
+
+def liger_tvd(
+    input,
+    target,
+    shift_labels=None,
+    reduction: str = "mean",
+    ignore_index: int = -100,
+):
+    return LigerTVDLossFunction.apply(
+        input,
+        target,
+        shift_labels,
+        reduction,
+        ignore_index,
+    )
+
+
+def liger_layer_norm(X, W, B, eps):
+    return LigerLayerNormFunction.apply(X, W, B, eps)
+
+
+def liger_qwen2vl_mrope(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    return LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section, unsqueeze_dim)
+
+
+def liger_rms_norm(X, W, eps, offset: float = 0.0, casting_mode: str = "llama", in_place: bool = True):
+    return LigerRMSNormFunction.apply(X, W, eps, offset, casting_mode, in_place)
+
+
+def liger_poly_norm(X, W, B, eps=1e-6, in_place=True):
+    return LigerPolyNormFunction.apply(X, W, B, eps, in_place)
+
+
+def liger_fused_add_rms_norm(X, R, W, eps, offset: float = 0.0, casting_mode: str = "llama", in_place: bool = True):
+    return LigerFusedAddRMSNormFunction.apply(X, R, W, eps, offset, casting_mode, in_place)
+
+
+def liger_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    return LigerRopeFunction.apply(q, k, cos, sin, position_ids, unsqueeze_dim)
+
+
+def liger_swiglu(a, b):
+    return LigerSiLUMulFunction.apply(a, b)
+
+
+def liger_softmax(x):
+    return LigerSoftmaxFunction.apply(x)
+
+
+def liger_dyt(x, alpha, gamma, beta):
+    return LigerDyTFunction.apply(x, alpha, gamma, beta)
+
+
+def liger_mhc_coeffs(
+    x,
+    phi,
+    b,
+    alpha_pre,
+    alpha_post,
+    alpha_res,
+    *,
+    allow_fp32: bool = False,
+    tmax: int = 20,
+    rms_eps: float = 1e-6,
+    pre_eps: float = 0.0,
+    sinkhorn_eps: float = 1e-6,
+    post_mult: float = 2.0,
+):
+    # Convert config scalars to Python types so they are not included in the
+    # autograd computation graph (they are not learnable parameters).
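+    # The kernel returns the routing coefficients (h_pre, h_post, h_res),
+    # which are consumed by liger_mhc_pre / liger_mhc_post_res below.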
+ return LigerMHCCoeffsFunction.apply( + x, + phi, + b, + alpha_pre, + alpha_post, + alpha_res, + allow_fp32, + int(tmax), + float(rms_eps), + float(pre_eps), + float(sinkhorn_eps), + float(post_mult), + ) + + +def liger_mhc_pre(x, h_pre): + return LigerMHCPreFunction.apply(x, h_pre) + + +def liger_mhc_post_res(x, f_out, h_post, h_res): + return LigerMHCPostResFunction.apply(x, f_out, h_post, h_res) + + +def liger_mhc_apply(x, f_out, h_pre, h_post, h_res, *, return_x_in: bool = False): + x_in = liger_mhc_pre(x, h_pre) + x_out = liger_mhc_post_res(x, f_out, h_post, h_res) + if return_x_in: + return x_out, x_in + return x_out + + +def liger_mhc_forward( + x, + layer, + phi, + b, + alpha_pre, + alpha_post, + alpha_res, + *, + allow_fp32=False, + tmax=20, + rms_eps=1e-6, + pre_eps=0.0, + sinkhorn_eps=1e-6, + post_mult=2.0, + return_coeffs=False, +): + """High-level helper: compute coeffs, apply pre, run layer, then apply post+res.""" + h_pre, h_post, h_res = liger_mhc_coeffs( + x, + phi, + b, + alpha_pre, + alpha_post, + alpha_res, + allow_fp32=allow_fp32, + tmax=tmax, + rms_eps=rms_eps, + pre_eps=pre_eps, + sinkhorn_eps=sinkhorn_eps, + post_mult=post_mult, + ) + x_in = liger_mhc_pre(x, h_pre) + layer_dtype = x_in.dtype + if hasattr(layer, "parameters"): + try: + layer_dtype = next(layer.parameters()).dtype + except StopIteration: + layer_dtype = x_in.dtype + if x_in.dtype != layer_dtype: + x_in = x_in.to(layer_dtype) + f_out = layer(x_in) + x_out = liger_mhc_post_res(x, f_out, h_post, h_res) + if return_coeffs: + return x_out, (h_pre, h_post, h_res) + return x_out diff --git a/src/liger_kernel/transformers/fused_add_rms_norm.py b/src/liger_kernel/transformers/fused_add_rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..e9bd0baa6aaba62326b8d6e00ad4b4daba99692f --- /dev/null +++ b/src/liger_kernel/transformers/fused_add_rms_norm.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerFusedAddRMSNormFunction + + +class LigerFusedAddRMSNorm(nn.Module): + def __init__( + self, + hidden_size, + eps=1e-6, + offset=0.0, + casting_mode="llama", + init_fn="ones", + in_place=False, + ): + super().__init__() + assert init_fn in [ + "ones", + "zeros", + ], f"init_fn must be either 'ones' or 'zeros', got {init_fn}" + self.weight = nn.Parameter(torch.ones(hidden_size) if init_fn == "ones" else torch.zeros(hidden_size)) + self.variance_epsilon, self.offset, self.casting_mode, self.in_place = (eps, offset, casting_mode, in_place) + + def forward(self, hidden_states, residual): + return LigerFusedAddRMSNormFunction.apply( + hidden_states, + residual, + self.weight, + self.variance_epsilon, + self.offset, + self.casting_mode, + self.in_place, + ) + + def extra_repr(self): + return ( + f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}, offset={self.offset}, in_place={self.in_place}" + ) diff --git a/src/liger_kernel/transformers/fused_linear_cross_entropy.py b/src/liger_kernel/transformers/fused_linear_cross_entropy.py new file mode 100755 index 0000000000000000000000000000000000000000..c4a4474ce75c713c7199a786ed2408886ebfb0cb --- /dev/null +++ b/src/liger_kernel/transformers/fused_linear_cross_entropy.py @@ -0,0 +1,69 @@ +from typing import Optional + +import torch + +from liger_kernel.ops import LigerFusedLinearCrossEntropyFunction +from liger_kernel.transformers.functional import CrossEntropyOutput + + +class LigerFusedLinearCrossEntropyLoss(torch.nn.Module): + def __init__( + self, + ce_weight: Optional[torch.FloatTensor] = None, + 
ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        return_z_loss: bool = False,
+        accum_dtype: Optional[torch.dtype] = None,
+        use_token_scaling: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
+    ):
+        super().__init__()
+        assert (label_smoothing >= 0) and (label_smoothing <= 1), (
+            f"label_smoothing must be between 0.0 and 1.0. Got: {label_smoothing}"
+        )
+        assert reduction in {
+            "mean",
+            "sum",
+            "none",
+        }, f"reduction must be 'mean' or 'sum' or 'none'. Got: {reduction}"
+        assert softcap is None or softcap > 0, f"softcap must be greater than 0.0 or None. Got: {softcap}"
+        self.ce_weight = ce_weight
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.softcap = softcap
+        self.return_z_loss = return_z_loss
+        self.accum_dtype = accum_dtype
+        self.use_token_scaling = use_token_scaling
+        self.return_token_accuracy = return_token_accuracy
+        self.return_predicted_tokens = return_predicted_tokens
+
+    def forward(self, lin_weight, _input, target, bias=None):
+        loss, z_loss, token_accuracy, predicted_tokens = LigerFusedLinearCrossEntropyFunction.apply(
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ce_weight,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+            self.softcap,
+            self.return_z_loss,
+            self.accum_dtype,
+            self.use_token_scaling,
+            self.return_token_accuracy,
+            self.return_predicted_tokens,
+        )
+        if not self.return_z_loss and not self.return_token_accuracy and not self.return_predicted_tokens:
+            return loss
+
+        return CrossEntropyOutput(
+            loss=loss, z_loss=z_loss, token_accuracy=token_accuracy, predicted_tokens=predicted_tokens
+        )
diff --git a/src/liger_kernel/transformers/fused_linear_jsd.py b/src/liger_kernel/transformers/fused_linear_jsd.py
new file mode 100755
index 0000000000000000000000000000000000000000..38f668c6f88397c3d58a004dc769daff016644b5
--- /dev/null
+++ b/src/liger_kernel/transformers/fused_linear_jsd.py
@@ -0,0 +1,95 @@
+from typing import Optional
+
+import torch
+
+from liger_kernel.ops import LigerFusedLinearJSDFunction
+
+
+class LigerFusedLinearJSD(torch.nn.Module):
+    r"""Fusing the last linear layer with generalized JSD
+
+    Handle the forward and backward pass of the final linear layer via JSD by avoiding
+    the materialization of the large logits tensor.
+
+    Args:
+        jsd_beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
+        ignore_index (int): The index to ignore in the target. Default: `-100`
+        temperature (float): temperature in softmax function to control the output probability distribution. Default: `1.0`
+
+    Shape:
+        - student_input: :math:`(BT, H)`, where B is batch size, T is sequence length, H is hidden dimension.
+        - student_weight: :math:`(V, H)`, where V is vocab size.
+        - teacher_input: :math:`(BT, H')`, where H' is hidden dimension of the teacher model.
+        - teacher_weight: :math:`(V, H')`, where hidden size H and H' can be different.
+        - shift_labels: :math:`(BT,)`
+        - Output: a scalar.
+
+    Examples:
+    ```python
+    >>> (B, T, H_s, H_t, V) = (2, 2, 3, 5, 10)
+    >>> fused_jsd = LigerFusedLinearJSD(jsd_beta=0.1, temperature=2.0)
+    >>> # generate inputs and weights
+    >>> student_input = torch.rand(B * T, H_s, device="cuda", requires_grad=True)
+    >>> student_lin = torch.nn.Linear(H_s, V, bias=False, device="cuda")
+    >>> # teacher input doesn't require grad, hidden_dim can be different from student's
+    >>> teacher_input = torch.rand(B * T, H_t, device="cuda")
+    >>> teacher_lin = torch.nn.Linear(H_t, V, bias=False, device="cuda")
+    >>> output = fused_jsd(student_input, student_lin.weight, teacher_input, teacher_lin.weight)
+    >>> output.backward()
+    >>>
+    >>> # Example with labels for supervised fine-tuning (SFT) context:
+    >>>
+    >>> # Assume hidden states, lm_heads and corresponding labels are given
+    >>> student_lm_head = torch.nn.Linear(H_s, V, bias=False, device="cuda")
+    >>> student_hidden_states = torch.randn(B, T, H_s, device="cuda", requires_grad=True)
+    >>> teacher_lm_head = torch.nn.Linear(H_t, V, bias=False, device="cuda")
+    >>> teacher_hidden_states = torch.randn(B, T, H_t, device="cuda")
+    >>> labels = torch.randint(0, V, (B, T), dtype=torch.long, device="cuda")
+    >>>
+    >>> # Shift so that tokens < n predict n
+    >>> shift_student_hidden_states = student_hidden_states[..., :-1, :].contiguous()
+    >>> shift_teacher_hidden_states = teacher_hidden_states[..., :-1, :].contiguous()
+    >>> shift_labels = labels[..., 1:].contiguous()
+    >>>
+    >>> # Flatten tokens
+    >>> shift_student_hidden_states = shift_student_hidden_states.view(-1, H_s)
+    >>> shift_teacher_hidden_states = shift_teacher_hidden_states.view(-1, H_t)
+    >>> shift_labels = shift_labels.view(-1)
+    >>>
+    >>> # Calculate loss
+    >>> loss_fct = LigerFusedLinearJSD(jsd_beta=0.1)
+    >>> loss = loss_fct(
+    >>>     shift_student_hidden_states,
+    >>>     student_lm_head.weight,
+    >>>     shift_teacher_hidden_states,
+    >>>     teacher_lm_head.weight,
+    >>>     shift_labels
+    >>> )
+    ```
+    """
+
+    def __init__(self, jsd_beta=0.5, ignore_index=-100, temperature=1.0):
+        super().__init__()
+        assert temperature != 0, "temperature cannot be 0."
+        self.jsd_beta = jsd_beta
+        self.temperature = temperature
+        self.ignore_index = ignore_index
+
+    def forward(
+        self,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        shift_labels: Optional[torch.LongTensor],
+    ):
+        return LigerFusedLinearJSDFunction.apply(
+            student_input,
+            student_weight,
+            teacher_input,
+            teacher_weight,
+            shift_labels,
+            self.jsd_beta,
+            self.ignore_index,
+            self.temperature,
+        )
diff --git a/src/liger_kernel/transformers/fused_neighborhood_attention.py b/src/liger_kernel/transformers/fused_neighborhood_attention.py
new file mode 100755
index 0000000000000000000000000000000000000000..92a3e8503379156c10cc7d0361ee05c833053ac8
--- /dev/null
+++ b/src/liger_kernel/transformers/fused_neighborhood_attention.py
@@ -0,0 +1,234 @@
+import math
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+from liger_kernel.ops import LigerFusedNeighborhoodAttentionFunction
+
+
+class LigerFusedNeighborhoodAttention(nn.Module):
+    """
+    Liger Fused Neighborhood Attention Module.
+
+    Paper: https://arxiv.org/pdf/2504.16922
+
+    Fused neighborhood attention restricts the attention mechanism to a local neighborhood
+    around each position, reducing computational complexity from O(n²) to O(n*k)
+    where k is the neighborhood size.
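+    Each query position attends only to keys within a window of `kernel_size`
+    tokens centered on it, optionally spaced out by `dilation`.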
+ + Args: + hidden_size (int): The hidden dimension size + num_heads (int): Number of attention heads + kernel_size (int): Size of the neighborhood window (default: 7) + dilation (int): Dilation factor for the neighborhood (default: 1) + bias (bool): Whether to use bias in linear projections (default: True) + dropout (float): Dropout probability (default: 0.0) + scale (Optional[float]): Scaling factor for attention scores. + If None, uses 1/sqrt(head_dim) (default: None) + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + kernel_size: int = 7, + dilation: int = 1, + bias: bool = True, + dropout: float = 0.0, + scale: Optional[float] = None, + ): + super().__init__() + + if hidden_size % num_heads != 0: + raise ValueError(f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})") + + if kernel_size <= 0: + raise ValueError(f"kernel_size ({kernel_size}) must be positive") + + if kernel_size % 2 == 0: + raise ValueError(f"kernel_size ({kernel_size}) must be odd") + + if dilation < 1: + raise ValueError(f"dilation ({dilation}) must be positive") + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.kernel_size = kernel_size + self.dilation = dilation + self.scale = scale if scale is not None else 1.0 / math.sqrt(self.head_dim) + self.dropout_p = dropout + + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=bias) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=bias) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=bias) + + self.out_proj = nn.Linear(hidden_size, hidden_size, bias=bias) + + if dropout > 0.0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Forward pass of the fused neighborhood attention module. + + Args: + hidden_states (torch.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size] + attention_mask (Optional[torch.Tensor]): Attention mask (currently not supported) + + Returns: + torch.Tensor: Output tensor of shape [batch_size, seq_len, hidden_size] + """ + if attention_mask is not None: + raise NotImplementedError("Attention mask is not yet supported in LigerFusedNeighborhoodAttention") + + batch_size, seq_len, hidden_size = hidden_states.shape + + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + key = key.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + value = value.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + + attn_output = LigerFusedNeighborhoodAttentionFunction.apply( + query, key, value, self.kernel_size, self.dilation, self.scale + ) + + attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_size) + + if self.dropout is not None: + attn_output = self.dropout(attn_output) + + output = self.out_proj(attn_output) + + return output + + def extra_repr(self) -> str: + return ( + f"hidden_size={self.hidden_size}, num_heads={self.num_heads}, " + f"head_dim={self.head_dim}, kernel_size={self.kernel_size}, " + f"dilation={self.dilation}, scale={self.scale}, dropout={self.dropout_p}" + ) + + +class LigerFusedNeighborhoodAttentionLayer(nn.Module): + """ + A complete neighborhood attention layer with layer norm and residual connection. 
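+
+    Uses a pre-norm layout: the input is layer-normalized, passed through
+    LigerFusedNeighborhoodAttention, optionally dropped out, and added back to
+    the residual stream.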
+ + Args: + hidden_size (int): The hidden dimension size + num_heads (int): Number of attention heads + kernel_size (int): Size of the neighborhood window (default: 7) + dilation (int): Dilation factor for the neighborhood (default: 1) + bias (bool): Whether to use bias in linear projections (default: True) + dropout (float): Dropout probability (default: 0.0) + layer_norm_eps (float): Epsilon for layer normalization (default: 1e-5) + scale (Optional[float]): Scaling factor for attention scores (default: None) + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + kernel_size: int = 7, + dilation: int = 1, + bias: bool = True, + dropout: float = 0.0, + layer_norm_eps: float = 1e-5, + scale: Optional[float] = None, + ): + super().__init__() + + self.attention = LigerFusedNeighborhoodAttention( + hidden_size=hidden_size, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=dilation, + bias=bias, + dropout=dropout, + scale=scale, + ) + + self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps) + + if dropout > 0.0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + Forward pass with residual connection and layer normalization. + + Args: + hidden_states (torch.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size] + attention_mask (Optional[torch.Tensor]): Attention mask (currently not supported) + + Returns: + torch.Tensor: Output tensor of shape [batch_size, seq_len, hidden_size] + """ + normed_hidden_states = self.layer_norm(hidden_states) + + attn_output = self.attention(normed_hidden_states, attention_mask) + + if self.dropout is not None: + attn_output = self.dropout(attn_output) + + output = hidden_states + attn_output + + return output + + +class LigerFusedNeighborhoodAttentionConfig: + """ + Configuration class for Fused Neighborhood Attention. + + This can be used to easily configure neighborhood attention parameters + for different model architectures. 
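+
+    Example (a minimal sketch; `to_dict()` mirrors the constructor arguments of
+    `LigerFusedNeighborhoodAttentionLayer`)::
+
+        config = LigerFusedNeighborhoodAttentionConfig(hidden_size=512, num_heads=8)
+        layer = LigerFusedNeighborhoodAttentionLayer(**config.to_dict())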
+ """ + + def __init__( + self, + hidden_size: int = 768, + num_heads: int = 12, + kernel_size: int = 7, + dilation: int = 1, + bias: bool = True, + dropout: float = 0.0, + layer_norm_eps: float = 1e-5, + scale: Optional[float] = None, + ): + self.hidden_size = hidden_size + self.num_heads = num_heads + self.kernel_size = kernel_size + self.dilation = dilation + self.bias = bias + self.dropout = dropout + self.layer_norm_eps = layer_norm_eps + self.scale = scale + + def to_dict(self): + return { + "hidden_size": self.hidden_size, + "num_heads": self.num_heads, + "kernel_size": self.kernel_size, + "dilation": self.dilation, + "bias": self.bias, + "dropout": self.dropout, + "layer_norm_eps": self.layer_norm_eps, + "scale": self.scale, + } diff --git a/src/liger_kernel/transformers/geglu.py b/src/liger_kernel/transformers/geglu.py new file mode 100755 index 0000000000000000000000000000000000000000..fb72cbbab508c1a2f20afda39ce2b8c2f60ed784 --- /dev/null +++ b/src/liger_kernel/transformers/geglu.py @@ -0,0 +1,22 @@ +import torch.nn as nn + +from liger_kernel.ops import LigerGELUMulFunction + + +class LigerGEGLUMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + # TODO: support exact GELU + # Right now Gemma 1, 1.1 and 2 models are all using `gelu_pytorch_tanh` + # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175 + # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/activations.py#L46 + # So we can safely assume we use tanh approximation form all the time + + def forward(self, x): + return self.down_proj(LigerGELUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) diff --git a/src/liger_kernel/transformers/group_norm.py b/src/liger_kernel/transformers/group_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..efc6f8ac157e0beacbcd8d0d8b31f3ba16e64073 --- /dev/null +++ b/src/liger_kernel/transformers/group_norm.py @@ -0,0 +1,50 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerGroupNormFunction + + +class LigerGroupNorm(nn.Module): + def __init__(self, num_channels, num_groups, eps=1e-6, bias=False, init_fn="ones"): + """ + A Group Normalization layer. + Args: + num_channels (int): Number of channels in the input tensor. + num_groups (int): Number of groups to divide the channels into. + eps (float, optional): A value added to the denominator for numerical stability. Default: 1e-6. + bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``False``. + init_fn (str, optional): Initialization function for the learnable parameters. Default: "ones". 
+ """ + super().__init__() + assert init_fn in [ + "ones", + "zeros", + ], f"init_fn must be either 'ones' or 'zeros', got {init_fn}" + + assert num_channels % num_groups == 0, ( + f"Number of channels {num_channels} must be divisible by num_groups {num_groups}" + ) + self.num_channels = num_channels + self.num_groups = num_groups + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_channels) if init_fn == "ones" else torch.zeros(num_channels)) + self.bias = nn.Parameter(torch.randn(num_channels) if bias else torch.zeros(num_channels)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # hidden_states: (batch_size, num_channels, *) + assert hidden_states.dim() >= 3, f"Input must have atleast 3 dimensions, got {hidden_states.dim()}" + assert hidden_states.size(1) == self.num_channels, ( + f"Input tensor must have {self.num_channels} channels, got {hidden_states.size(1)}" + ) + return LigerGroupNormFunction.apply( + hidden_states, + self.weight, + self.bias, + self.num_channels, + self.num_groups, + self.variance_epsilon, + ) + + def extra_repr(self): + return f"{self.hidden_size}, num_channels={self.num_channels}, num_groups={self.num_groups}, eps={self.eps}" diff --git a/src/liger_kernel/transformers/grpo_loss.py b/src/liger_kernel/transformers/grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..caa053bd6e74acfa5a035fc06fd5dfdff2c0aeb5 --- /dev/null +++ b/src/liger_kernel/transformers/grpo_loss.py @@ -0,0 +1,206 @@ +import torch + +from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase +from liger_kernel.ops import GrpoLossFunction + + +def triton_grpo_loss( + logits, + old_logp, + ref_logp, + completion_ids, + advantages, + completion_mask=None, + temperature=0.9, + beta=0.04, + eps_low=0.2, + eps_high=0.4, + inplace=True, + loss_type="dapo", + max_completion_length=None, + importance_sampling_level="token", + reduce=False, + sapo_temperature_pos=1.0, + sapo_temperature_neg=1.05, + vllm_is_ratio=None, + delta=None, + use_bias_correction_kl=False, +): + """ + Triton-optimized GRPO loss function. + + Args: + logits: Model logits (B, L+1, V) + old_logp: Old policy log probabilities (B, L) or None + ref_logp: Reference model log probabilities (B, L) or None (required if beta != 0) + completion_ids: Token IDs for completions (B, L) + advantages: Per-sequence advantages (B,) + completion_mask: Mask for valid tokens (B, L) or None + temperature: Temperature for log softmax + beta: KL penalty coefficient + eps_low: Lower clipping bound for importance ratio + eps_high: Upper clipping bound for importance ratio + inplace: Whether to modify logits in-place during backward + loss_type: Loss reduction type ("grpo", "bnpo", "dr_grpo", "dapo", "cispo", "sapo", "luspo") + max_completion_length: Max completion length for dr_grpo loss type; defaults to sequence length if None + importance_sampling_level: "token" or "sequence" importance sampling + reduce: If True, return reduced loss; if False, return per-token loss + vllm_is_ratio: vLLM importance sampling ratio (B, L) or (B, 1) or None. + Used to correct for distribution mismatch when using vLLM for generation. + Applied to PPO loss BEFORE adding KL penalty. + delta: Upper clamp for two-sided clipping (INTELLECT-2). When set, coef_1 is clamped + to max=delta before computing the PPO loss. Only supported for standard PPO loss + types (grpo, bnpo, dr_grpo, dapo, luspo). None means disabled. 
+        use_bias_correction_kl: If True, multiply KL divergence by coef_1 (importance sampling
+            ratio) for bias-corrected KL estimation (DeepSeek-V3.2). Default False.
+
+    Returns:
+        If reduce=True: (loss, metrics) where metrics = [kl_mean, clip_ratio] or [clip_ratio]
+        If reduce=False: (per_token_loss, per_token_kl, is_clipped)
+    """
+    assert logits is not None and completion_ids is not None and advantages is not None, (
+        "must provide logits, completion_ids and advantages"
+    )
+    assert importance_sampling_level in ("token", "sequence"), (
+        f"importance_sampling_level must be 'token' or 'sequence', got {importance_sampling_level}"
+    )
+
+    result = GrpoLossFunction.apply(
+        logits,
+        old_logp,
+        ref_logp,
+        completion_ids,
+        advantages,
+        completion_mask,
+        temperature,
+        beta,
+        eps_low,
+        eps_high,
+        inplace,
+        loss_type,
+        max_completion_length,
+        reduce,
+        importance_sampling_level,
+        sapo_temperature_pos,
+        sapo_temperature_neg,
+        vllm_is_ratio,
+        delta,
+        use_bias_correction_kl,
+    )
+
+    if not reduce:
+        # Returns (per_token_loss, per_token_kl, is_clipped) - all (B, L) tensors
+        return result
+
+    # reduce=True: Returns (reduced_loss, kl_mean, clip_ratio) - all scalars
+    reduced_loss, kl_mean, clip_ratio = result
+    metrics = []
+    if beta != 0.0 and kl_mean is not None:
+        metrics.append(kl_mean)
+    metrics.append(clip_ratio)
+    return reduced_loss, metrics
+
+
+def _reduce_grpo_loss(per_token_loss, completion_mask, loss_type, max_completion_length):
+    mask = completion_mask
+    if mask is None:
+        mask = torch.ones_like(per_token_loss, dtype=per_token_loss.dtype, device=per_token_loss.device)
+    mask = mask.to(per_token_loss.dtype)
+
+    if loss_type == "grpo" or loss_type == "sapo":
+        # SAPO uses the same normalization as GRPO (per-sequence average)
+        per_seq = (per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
+        return per_seq.mean()
+    if loss_type == "bnpo":
+        return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
+    if loss_type == "dr_grpo":
+        batch = per_token_loss.shape[0]
+        max_len = max_completion_length if max_completion_length is not None else per_token_loss.shape[1]
+        return (per_token_loss * mask).sum() / (batch * max_len)
+    if loss_type == "dapo" or loss_type == "cispo":
+        # CISPO uses the same normalization as DAPO
+        normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(mask)
+        return (per_token_loss * mask).sum() / normalizer
+    if loss_type == "luspo":
+        # LUSPO: scale each sequence's loss by its valid token count, then average across sequences
+        return (per_token_loss * mask.sum(-1, keepdim=True)).mean()
+    raise ValueError(f"Unsupported loss_type '{loss_type}' for Triton GRPO loss.")
+
+
+def _masked_mean(values, mask):
+    if mask is None:
+        mask = torch.ones_like(values, dtype=values.dtype, device=values.device)
+    mask = mask.to(values.dtype)
+    return (values * mask).sum() / mask.sum().clamp(min=1.0)
+
+
+# This is a demo of how to use grpo_loss in GRPOTrainer. The TRL version must be >= 0.26.2.
+"""
+import torch
+import trl
+from packaging.version import Version
+assert Version(trl.__version__) >= Version("0.26.2"), "please pip install trl>=0.26.2"
+from trl.extras.profiling import profiling_decorator
+
+@profiling_decorator
+def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep):
+    # We add 1 to `logits_to_keep` because the last logit of the sequence is later excluded
+    logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
+    return fused_selective_log_softmax(logits, input_ids, self.temperature, mask=attention_mask)

+@profiling_decorator
+def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+    if return_outputs:
+        raise ValueError("The GRPOTrainer does not support returning outputs")
+    # Compute the per-token log probabilities for the model
+
+    prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+    completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
+    input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+    attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+    logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens
+    logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
+
+    ref_per_token_logps = inputs["ref_per_token_logps"]
+    advantages = inputs["advantages"]
+    old_per_token_logps = inputs["old_per_token_logps"]
+
+    # Get vLLM importance sampling ratio if using vLLM with importance sampling correction
+    vllm_is_ratio = inputs.get("importance_sampling_ratio", None)
+
+    per_token_loss, per_token_kl, is_clipped = triton_grpo_loss(
+        logits,
+        old_per_token_logps,
+        ref_per_token_logps,
+        completion_ids,
+        advantages,
+        completion_mask,
+        temperature=self.temperature,
+        beta=self.beta,
+        eps_low=self.epsilon_low,
+        eps_high=self.epsilon_high,
+        importance_sampling_level=self.importance_sampling_level,  # "token" or "sequence"
+        vllm_is_ratio=vllm_is_ratio,  # vLLM distribution correction
+    )
+    loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()
+
+    # Log the metrics
+    mode = "eval" if self.control.should_evaluate else "train"
+
+    if self.beta != 0.0:
+        mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()
+        self._metrics[mode]["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
+
+    clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
+    self._metrics[mode]["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
+    return loss
+
+trl.GRPOTrainer._get_per_token_logps = _get_per_token_logps
+trl.GRPOTrainer.compute_loss = compute_loss
+trigger = None
+"""
+
+# Add this line as the first line of grpo.py in open-r1
+"""
+from liger_kernel.transformers.grpo_loss import trigger
+"""
diff --git a/src/liger_kernel/transformers/jsd.py b/src/liger_kernel/transformers/jsd.py
new file mode 100755
index 0000000000000000000000000000000000000000..a8489e7d1a42a2001c6846eeaf228571267474c5
--- /dev/null
+++ b/src/liger_kernel/transformers/jsd.py
@@ -0,0 +1,70 @@
+from typing import Optional
+
+import torch
+
+from liger_kernel.ops import LigerJSDFunction
+
+
+class LigerJSD(torch.nn.Module):
+    r"""The generalized Jensen-Shannon Divergence.
+    .. math::
+        JSD(\beta)(P || Q)
+            = \beta * KLDiv(P || (\beta * P + (1 - \beta) * Q)) + (1 - \beta) * KLDiv(Q || (\beta * P + (1 - \beta) * Q))
+
+    .. note::
+        As with all the other losses in PyTorch, this function expects the first argument,
+        :attr:`log_q`, to be the predictions, the output of the student model in log-space,
+        and the second, :attr:`log_p`, to be the observations, the output of the teacher model in log-space.
+        This differs from the standard mathematical notation :math:`JSD(P || Q)` where
+        :math:`P` denotes the teacher model and :math:`Q` denotes the student model.
+
+    Args:
+        beta (float): coefficient beta of generalized JSD in the interval [0, 1]. It implements forward/reverse KL when beta equals 0 and 1 respectively. Default: `0.5`
+        ignore_index (int): The index to ignore in the target. Default: `-100`
+
+    Shape:
+        - Input: :math:`(BT, V)`, where B is batch size, T is sequence length, V is vocab size.
+        - Target: :math:`(BT, V)`, same shape as the input.
+        - shift_labels (Optional): :math:`(BT,)`
+        - Output: a scalar.
+
+    Examples:
+    ```python
+    >>> (B, T, V) = (2, 2, 5)
+    >>> jsd = LigerJSD(beta=0.1)
+    >>> # input should be a distribution in the log space
+    >>> input = torch.randn(B * T, V, requires_grad=True).log_softmax(dim=-1)
+    >>> target = torch.randn(B * T, V).log_softmax(dim=-1)
+    >>> output = jsd(input, target)
+    >>>
+    >>> # Example with labels for supervised fine-tuning (SFT) context
+    >>> # Assume logits and corresponding labels are given
+    >>> student_logits = torch.randn(B, T, V, requires_grad=True).log_softmax(dim=-1)
+    >>> teacher_logits = torch.randn(B, T, V).log_softmax(dim=-1)
+    >>> labels = torch.randint(0, V, (B, T), dtype=torch.long)
+    >>> # Shift so that tokens < n predict n
+    >>> shift_student_logits = student_logits[..., :-1, :].contiguous()
+    >>> shift_teacher_logits = teacher_logits[..., :-1, :].contiguous()
+    >>> shift_labels = labels[..., 1:].contiguous()
+    >>> # Flatten tokens
+    >>> shift_student_logits = shift_student_logits.view(-1, V)
+    >>> shift_teacher_logits = shift_teacher_logits.view(-1, V)
+    >>> shift_labels = shift_labels.view(-1)
+    >>> # Calculate loss
+    >>> loss_fct = LigerJSD(beta=0.1)
+    >>> loss = loss_fct(shift_student_logits, shift_teacher_logits, shift_labels)
+
+    ```
+    """
+
+    def __init__(self, beta: float = 0.5, ignore_index: int = -100):
+        super().__init__()
+        self.beta = beta
+        self.ignore_index = ignore_index
+
+    def forward(
+        self,
+        log_q: torch.Tensor,
+        log_p: torch.Tensor,
+        shift_labels: Optional[torch.LongTensor] = None,
+    ):
+        return LigerJSDFunction.apply(log_q, log_p, shift_labels, self.beta, self.ignore_index)
diff --git a/src/liger_kernel/transformers/kl_div.py b/src/liger_kernel/transformers/kl_div.py
new file mode 100755
index 0000000000000000000000000000000000000000..97d9e68c591e7a7a8e48f4b529fce320511d9948
--- /dev/null
+++ b/src/liger_kernel/transformers/kl_div.py
@@ -0,0 +1,12 @@
+import torch.nn as nn
+
+from liger_kernel.ops import LigerKLDivLossFunction
+
+
+class LigerKLDIVLoss(nn.KLDivLoss):
+    def __init__(self, eps: float = 1e-10, *args, **kwargs):
+        super(LigerKLDIVLoss, self).__init__(*args, **kwargs)
+        self.eps = eps
+
+    def forward(self, y_pred, y_true):
+        return LigerKLDivLossFunction.apply(y_pred, y_true, self.reduction, self.log_target, self.eps)
diff --git a/src/liger_kernel/transformers/layer_norm.py b/src/liger_kernel/transformers/layer_norm.py
new file mode 100755
index 0000000000000000000000000000000000000000..34a6325117a65cc99967ba86805a68eb84a2ffa8
--- /dev/null
+++
b/src/liger_kernel/transformers/layer_norm.py @@ -0,0 +1,24 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerLayerNormFunction + + +class LigerLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6, bias=False, init_fn="ones"): + super().__init__() + assert init_fn in [ + "ones", + "zeros", + ], f"init_fn must be either 'ones' or 'zeros', got {init_fn}" + self.hidden_size = hidden_size + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_size) if init_fn == "ones" else torch.zeros(hidden_size)) + self.bias = nn.Parameter(torch.randn(hidden_size) if bias else torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + return LigerLayerNormFunction.apply(hidden_states, self.weight, self.bias, self.variance_epsilon) + + def extra_repr(self): + return f"{self.hidden_size}, eps={self.eps}" diff --git a/src/liger_kernel/transformers/llama4_rope.py b/src/liger_kernel/transformers/llama4_rope.py new file mode 100755 index 0000000000000000000000000000000000000000..d808fdec6ccdcd51ff51268f999b4dec43d15305 --- /dev/null +++ b/src/liger_kernel/transformers/llama4_rope.py @@ -0,0 +1,93 @@ +""" +Liger Kernel implementation of Llama4 Rotary Position Embedding (RoPE). +Supports both text and vision RoPE variants with fused operations for optimal performance. +""" + +import torch + +from liger_kernel.ops import LigerLlama4RopeFunction + + +def liger_llama4_text_rotary_pos_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Liger-optimized implementation of Llama4 text rotary position embedding. + + This implementation uses a fused Triton kernel for complex multiplication, + providing significant performance improvements over the original PyTorch implementation. + + Args: + xq (torch.Tensor): Query tensor of shape (batch_size, seq_len, num_heads, head_dim) + xk (torch.Tensor): Key tensor of shape (batch_size, seq_len, num_heads, head_dim) + freqs_cis (torch.Tensor): Complex frequency tensor from Llama4TextRotaryEmbedding + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Rotated query and key tensors + """ + # Use fused Triton kernel for complex RoPE + return LigerLlama4RopeFunction.apply(xq, xk, freqs_cis) + + +def liger_llama4_vision_rotary_pos_emb( + query: torch.Tensor, + key: torch.Tensor, + freqs_ci: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Liger-optimized implementation of Llama4 vision rotary position embedding. + + This implementation uses the same fused Triton kernel as text RoPE, + providing performance improvements for vision transformer attention. 
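+
+    `freqs_ci` may arrive as a 2D, 3D, or 4D tensor depending on how the vision
+    rotary embedding was computed; the shape-handling logic below normalizes it
+    to a batched 3D layout before invoking the fused kernel.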
+ + Args: + query (torch.Tensor): Query tensor of shape (batch_size, seq_len, num_heads, head_dim) + key (torch.Tensor): Key tensor of shape (batch_size, seq_len, num_heads, head_dim) + freqs_ci (torch.Tensor): Complex frequency tensor for 2D positions + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Rotated query and key tensors + """ + # Handle broadcasting for vision RoPE + if freqs_ci.dim() == 3: + try: + # Try the regular 3D expansion + freqs_ci = freqs_ci.unsqueeze(0).expand(query.shape[0], -1, -1) + except RuntimeError as e: + if "expand" in str(e) and "4" in str(e): + # The tensor is actually 4D internally, handle it differently + freqs_ci = freqs_ci.squeeze(1) # Remove the middle dimension + freqs_ci = freqs_ci.unsqueeze(0).expand(query.shape[0], -1, -1) + else: + raise e + elif freqs_ci.dim() == 4: # (1, seq_len, 1, head_dim//2) - already properly shaped + # Squeeze the middle dimension to get (1, seq_len, head_dim//2) + freqs_ci = freqs_ci.squeeze(2) + elif freqs_ci.dim() == 2: # (seq_len, head_dim//2) - needs expansion + freqs_ci = freqs_ci.unsqueeze(0).expand(query.shape[0], -1, -1) + else: + raise ValueError(f"Unexpected freqs_ci shape: {freqs_ci.shape}") + + # Use the same fused kernel as text RoPE + return LigerLlama4RopeFunction.apply(query, key, freqs_ci) + + +# Note: We only patch the functions, not the classes +# The original Llama4TextRotaryEmbedding and Llama4VisionRotaryEmbedding classes remain unchanged + + +# Convenience functions for monkey patching +def apply_liger_llama4_rope_full(modeling_module): + """ + Apply Liger optimizations to Llama4 RoPE functions. + + Args: + modeling_module: The transformers modeling module to patch + """ + # Replace the text RoPE function + modeling_module.apply_rotary_emb = liger_llama4_text_rotary_pos_emb + + # Replace the vision RoPE function + modeling_module.vision_apply_rotary_emb = liger_llama4_vision_rotary_pos_emb diff --git a/src/liger_kernel/transformers/mhc.py b/src/liger_kernel/transformers/mhc.py new file mode 100755 index 0000000000000000000000000000000000000000..30459dfbe98eb0e924956c12a9fef90988c35d2b --- /dev/null +++ b/src/liger_kernel/transformers/mhc.py @@ -0,0 +1,162 @@ +import warnings + +import torch +import torch.nn as nn + +from liger_kernel.transformers.functional import liger_mhc_coeffs +from liger_kernel.transformers.functional import liger_mhc_post_res +from liger_kernel.transformers.functional import liger_mhc_pre + + +class LigerMHC(nn.Module): + """ + Manifold-Constrained Hyper-Connections (mHC) wrapper. + + Wraps an arbitrary layer ``F: [..., C] -> [..., C]`` with multiple residual + streams, following the mHC architecture (arXiv:2512.24880). The input is a + multi-stream tensor of shape ``[..., HC, C]`` where ``HC`` is the number of + residual streams. + + The forward pass performs: + + 1. **Coefficients** -- Compute data-dependent routing coefficients + (``h_pre``, ``h_post``, ``h_res``) via a fused matmul + RMS + normalization + Sinkhorn-Knopp iterations. + 2. **Pre-aggregate** -- ``x_in = sum_i h_pre[i] * x[i]`` + (shape: ``[..., C]``) + 3. **Layer** -- ``f_out = layer(x_in)`` (shape: ``[..., C]``) + 4. **Post + residual** -- + ``x_out[o] = sum_i h_res[o,i] * x[i] + h_post[o] * f_out`` + (shape: ``[..., HC, C]``) + + Args: + layer: The module applied to the aggregated single-stream input. + Must accept ``[..., C]`` and return ``[..., C]``. Common choices + include ``nn.Linear``, attention layers, or MLP blocks. + hc: Number of residual streams (called *n* in the original paper). 
+ Recommended range: [2, 16]. Larger values increase register + pressure and Triton compile time. + c: Per-stream channel dimension. + tmax: Maximum Sinkhorn-Knopp iterations for doubly stochastic + normalization of ``h_res``. Default: 20. + rms_eps: Epsilon for RMS normalization of the projection. + Default: 1e-6. + pre_eps: Additive epsilon for ``h_pre`` after sigmoid. Default: 0.0. + sinkhorn_eps: Epsilon added during Sinkhorn normalization. + Default: 1e-6. + post_mult: Scaling factor for ``h_post`` after sigmoid. Default: 2.0. + phi_dtype: Dtype for the projection matrix ``phi``. Using float16 or + bfloat16 enables Tensor Core acceleration. Default: torch.float16. + allow_fp32: If True, accept FP32 input tensors. Note that FP32 mode + does **not** use Tensor Cores and will be slower. Default: False. + + Learnable Parameters: + - **phi** ``[HC*C, HC*HC + 2*HC]`` -- Projection matrix for computing + routing coefficients from flattened stream tokens. + - **b** ``[HC*HC + 2*HC]`` -- Bias for routing logits (float32). + - **alpha_pre** (scalar) -- Scales pre-routing logits before sigmoid. + - **alpha_post** (scalar) -- Scales post-routing logits before sigmoid. + - **alpha_res** (scalar) -- Scales residual logits before Sinkhorn. + + Example:: + + import torch + import torch.nn as nn + from liger_kernel.transformers import LigerMHC + + # Wrap a linear layer with 4 residual streams of dimension 256 + layer = nn.Linear(256, 256, bias=False, device="cuda", dtype=torch.bfloat16) + mhc = LigerMHC(layer, hc=4, c=256, phi_dtype=torch.bfloat16).cuda() + + # Input: [batch, seq_len, num_streams, channels] + x = torch.randn(2, 128, 4, 256, device="cuda", dtype=torch.bfloat16) + out = mhc(x) # shape: [2, 128, 4, 256] + + # In a transformer block (pseudocode): + # x = mhc_attn(x) # attention wrapped in LigerMHC + # x = mhc_ffn(x) # FFN wrapped in LigerMHC + """ + + def __init__( + self, + layer: nn.Module, + *, + hc: int, + c: int, + tmax: int = 20, + rms_eps: float = 1e-6, + pre_eps: float = 0.0, + sinkhorn_eps: float = 1e-6, + post_mult: float = 2.0, + phi_dtype: torch.dtype = torch.float16, + allow_fp32: bool = False, + ): + super().__init__() + self.layer = layer + # hc: number of residual streams (n in the paper) + self.hc = int(hc) + self.c = int(c) + + if hc > 16: + warnings.warn( + f"hc={hc} exceeds recommended range [2, 16]. " + "Large values may cause register pressure and increased compile time.", + stacklevel=2, + ) + self.tmax = int(tmax) + self.rms_eps = float(rms_eps) + self.pre_eps = float(pre_eps) + self.sinkhorn_eps = float(sinkhorn_eps) + self.post_mult = float(post_mult) + self.allow_fp32 = bool(allow_fp32) + + m = hc * hc + 2 * hc + k = hc * c + + try: + layer_device = next(self.layer.parameters()).device + except StopIteration: + layer_device = torch.device("cpu") + + # Note: for best speed, keep phi in BF16/FP16 to enable tensor-core matmul in Triton. 
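+        # phi projects each flattened multi-stream token (k = hc*c values) onto
+        # m = hc*hc + 2*hc routing logits: hc*hc for h_res plus hc each for h_pre and h_post.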
+ self.phi = nn.Parameter(torch.randn(k, m, dtype=phi_dtype, device=layer_device) * 0.02) + self.b = nn.Parameter(torch.zeros(m, dtype=torch.float32, device=layer_device)) + self.alpha_pre = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=layer_device)) + self.alpha_post = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=layer_device)) + self.alpha_res = nn.Parameter(torch.tensor(1.0, dtype=torch.float32, device=layer_device)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: [..., HC, C] (BF16/FP16 recommended; FP32 allowed if allow_fp32=True) + returns: [..., HC, C] + """ + if x.shape[-2] != self.hc or x.shape[-1] != self.c: + raise ValueError(f"Expected x.shape[-2:]=[{self.hc}, {self.c}], got {list(x.shape[-2:])}") + + h_pre, h_post, h_res = liger_mhc_coeffs( + x, + self.phi, + self.b, + self.alpha_pre, + self.alpha_post, + self.alpha_res, + allow_fp32=self.allow_fp32, + tmax=self.tmax, + rms_eps=self.rms_eps, + pre_eps=self.pre_eps, + sinkhorn_eps=self.sinkhorn_eps, + post_mult=self.post_mult, + ) + x_in = liger_mhc_pre(x, h_pre) # [..., C] + layer_dtype = x_in.dtype + for param in self.layer.parameters(recurse=True): + layer_dtype = param.dtype + break + if x_in.dtype != layer_dtype: + x_in = x_in.to(layer_dtype) + f_out = self.layer(x_in) # [..., C] + x_out = liger_mhc_post_res(x, f_out, h_post, h_res) # [..., HC, C] + return x_out + + def extra_repr(self) -> str: + return f"hc={self.hc}, c={self.c}, tmax={self.tmax}" diff --git a/src/liger_kernel/transformers/model/__init__.py b/src/liger_kernel/transformers/model/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/liger_kernel/transformers/model/exaone4.py b/src/liger_kernel/transformers/model/exaone4.py new file mode 100755 index 0000000000000000000000000000000000000000..c1fb863b12c6b4b08608202d9d99af5870546f83 --- /dev/null +++ b/src/liger_kernel/transformers/model/exaone4.py @@ -0,0 +1,139 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+
+    logits_to_keep (`int` or `torch.Tensor`, *optional*):
+        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+        This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, Exaone4ForCausalLM
+
+    >>> model = Exaone4ForCausalLM.from_pretrained("LGAI-EXAONE/EXAONE-4.0-1.2B")
+    >>> tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-4.0-1.2B")
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    # Remove output-control parameters that shouldn't be passed to loss functions
+    kwargs.pop("return_dict", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    # Compute loss
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+
+    else:
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((loss,) + output) if loss is not None else output
+ output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/falcon_h1.py b/src/liger_kernel/transformers/model/falcon_h1.py new file mode 100755 index 0000000000000000000000000000000000000000..1529ca057983706f5801242bf253fdddc75d4d14 --- /dev/null +++ b/src/liger_kernel/transformers/model/falcon_h1.py @@ -0,0 +1,125 @@ +from typing import TYPE_CHECKING +from typing import Optional +from typing import Union + +import torch + +if TYPE_CHECKING: + from transformers.models.falcon_h1.modeling_falcon_h1 import FalconHybridMambaAttentionDynamicCache + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional["FalconHybridMambaAttentionDynamicCache"] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> Union[tuple, LigerCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, FalconH1ForCausalLM + + >>> model = FalconH1ForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + # if in training mode, don't materialize logits + if skip_logits and labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and labels is not None + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/gemma.py b/src/liger_kernel/transformers/model/gemma.py new file mode 100755 index 0000000000000000000000000000000000000000..7fcdc9e282fe91a7d68350bfca7c27261eb748ac --- /dev/null +++ b/src/liger_kernel/transformers/model/gemma.py @@ -0,0 +1,144 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.cache_utils import Cache + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: 
Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GemmaForCausalLM + + >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" 
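+ >>> # Illustrative only: passing `labels` computes the loss; during training
+ >>> # the fused Liger kernel does so without materializing the full logits.
+ >>> outputs = model(**inputs, labels=inputs.input_ids)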
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + if loss is not None: + output_tuple = (loss,) + output_tuple + if token_accuracy is not None: + output_tuple = output_tuple + (token_accuracy,) + if predicted_tokens is not None: + output_tuple = output_tuple + (predicted_tokens,) + return output_tuple + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/gemma2.py b/src/liger_kernel/transformers/model/gemma2.py new file mode 100755 index 0000000000000000000000000000000000000000..5eebd22b2852712cc6b50083eaa8f8a5af8c6e5e --- /dev/null +++ b/src/liger_kernel/transformers/model/gemma2.py @@ -0,0 +1,157 @@ +import logging + +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.cache_utils import Cache + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + +logger = logging.getLogger(__name__) + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + 
attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ skip_logits: Optional[bool] = None,
+ **kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, Gemma2ForCausalLM
+
+ >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+
+ >>> prompt = "What is your favorite condiment?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "What is your favorite condiment?"
+ ```"""
+
+ if self.training and self.config._attn_implementation != "eager":
+ logger.warning_once(
+ "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`."
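+ # (likely because non-eager kernels do not apply Gemma2's attention logit soft-capping)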
+ ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + final_logit_softcapping=self.config.final_logit_softcapping, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + + loss = None + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.vocab_size, + **kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output_tuple = (loss,) + output_tuple if loss is not None else output_tuple + output_tuple = output_tuple + (token_accuracy,) if token_accuracy is not None else output_tuple + output_tuple = output_tuple + (predicted_tokens,) if predicted_tokens is not None else output_tuple + return output_tuple + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/gemma3.py b/src/liger_kernel/transformers/model/gemma3.py new file mode 100755 index 0000000000000000000000000000000000000000..74aae960070d49cdc5dabd586874b6d23d103247 --- /dev/null +++ b/src/liger_kernel/transformers/model/gemma3.py @@ -0,0 +1,343 @@ +from typing import Optional +from typing import Tuple +from typing import Union + +import torch +import torch.nn as nn + +from transformers.cache_utils import Cache +from transformers.utils import logging + +from 
liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
+from liger_kernel.transformers.model.output_classes import LigerGemma3CausalLMOutputWithPast
+
+logger = logging.get_logger(__name__)
+
+
+def causal_forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ skip_logits: Optional[bool] = None,
+ **loss_kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, Gemma3ForCausalLM
+
+ >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-3-1b-it")
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
+
+ >>> prompt = "What is your favorite condiment?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "What is your favorite condiment?"
+ ```"""
+
+ if self.training and self.config._attn_implementation != "eager":
+ logger.warning_once(
+ "It is strongly recommended to train Gemma3 models with the `eager` attention implementation "
+ f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`."
+ ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **loss_kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + shift_labels = loss_kwargs.pop("shift_labels", None) + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + final_logit_softcapping=self.config.final_logit_softcapping, + **loss_kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(kept_hidden_states) + if self.config.final_logit_softcapping is not None: + logits = logits / self.config.final_logit_softcapping + logits = torch.tanh(logits) + logits = logits * self.config.final_logit_softcapping + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.vocab_size, + **loss_kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output_tuple = (loss,) + output_tuple if loss is not None else output_tuple + output_tuple = output_tuple + (token_accuracy,) if token_accuracy is not None else output_tuple + output_tuple = output_tuple + (predicted_tokens,) if predicted_tokens is not None else output_tuple + return output_tuple + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) + + +def multimodal_forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: 
Optional[bool] = None, + **lm_kwargs, +) -> Union[tuple, LigerGemma3CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration + + >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it") + >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it") + + >>> messages = [ + ... { + ... "role": "system", + ... "content": [ + ... {"type": "text", "text": "You are a helpful assistant."} + ... ] + ... }, + ... { + ... "role": "user", "content": [ + ... {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"}, + ... {"type": "text", "text": "Where is the cat standing?"}, + ... ] + ... }, + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, + ... tokenize=True, + ... return_dict=True, + ... return_tensors="pt", + ... add_generation_prompt=True + ... ) + >>> # Generate + >>> generate_ids = model.generate(**inputs) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to" + ``` + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **lm_kwargs, + ) + + shift_labels = lm_kwargs.pop("shift_labels", None) + hidden_states = outputs[0] + + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + if skip_logits and labels is None: + raise ValueError("skip_logits is True, but labels is None") + + if skip_logits is None: + skip_logits = self.training and (labels is not None) + + if skip_logits: + shift_hidden_states = kept_hidden_states[..., :-1, :] + shift_labels = labels[..., 1:] + + hidden_device = shift_hidden_states.device + if attention_mask is not None: + # we use the input attention mask to shift the hidden_states and labels, because it is 2D. 
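+ # masking this way keeps each kept hidden state aligned one-to-one with its next-token label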
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device) + shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_hidden_states = shift_hidden_states.contiguous() + shift_labels = shift_labels.contiguous() + + # Flatten hidden state + shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size) + shift_labels = shift_labels.view(-1).to(hidden_device) + + result = LigerForCausalLMLoss( + hidden_states=shift_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + shift_labels=shift_labels, + final_logit_softcapping=getattr(self.config.text_config, "final_logit_softcapping", None), + **lm_kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + elif shift_labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
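+ # shift_labels is expected to arrive pre-shifted from the caller, so only the logits are sliced here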
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + return LigerGemma3CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/glm4.py b/src/liger_kernel/transformers/model/glm4.py new file mode 100755 index 0000000000000000000000000000000000000000..cb314292a1877110b834e0ce113600a216ff21e5 --- /dev/null +++ b/src/liger_kernel/transformers/model/glm4.py @@ -0,0 +1,141 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. 
+ This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Glm4ForCausalLM + + >>> model = Glm4ForCausalLM.from_pretrained("THUDM/GLM-4-9B-0414") + >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/GLM-4-9B-0414") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'Hey, are you conscious? Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m' + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/glm4v.py 
b/src/liger_kernel/transformers/model/glm4v.py new file mode 100755 index 0000000000000000000000000000000000000000..0dd3cda7f9d67f130753b4119f5a1ecf2768d2af --- /dev/null +++ b/src/liger_kernel/transformers/model/glm4v.py @@ -0,0 +1,165 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
+
+
+def lce_forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ mm_token_type_ids: Optional[torch.IntTensor] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ skip_logits: Optional[bool] = None,
+ **kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration
+
+ >>> MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
+ >>> messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image",
+ "url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
+ },
+ {
+ "type": "text",
+ "text": "describe this image"
+ }
+ ],
+ }
+ ]
+ >>> processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+ >>> model = Glm4vForConditionalGeneration.from_pretrained(
+ pretrained_model_name_or_path=MODEL_PATH,
+ dtype=torch.bfloat16,
+ device_map="auto",
+ )
+ >>> inputs = processor.apply_chat_template(
+ messages,
+ tokenize=True,
+ add_generation_prompt=True,
+ return_dict=True,
+ return_tensors="pt"
+ ).to(model.device)
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=8192)
+ >>> processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
+ Got it, let's describe the image.
First, there's a vintage car, specifically a Volkswagen Beetle + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + mm_token_type_ids=mm_token_type_ids, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/glm4v_moe.py b/src/liger_kernel/transformers/model/glm4v_moe.py new file mode 100755 index 0000000000000000000000000000000000000000..3203958f8111242933f70128e14c7e4f2565842b --- /dev/null +++ b/src/liger_kernel/transformers/model/glm4v_moe.py @@ -0,0 +1,174 @@ +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerGlm4vMoeCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + 
attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerGlm4vMoeCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). 
+ + Example: + + ```python + >>> from transformers import AutoProcessor, Glm4vMoeForConditionalGeneration + >>> import torch + + >>> MODEL_PATH = "zai-org/GLM-4.5V" + >>> messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png" + }, + { + "type": "text", + "text": "describe this image" + } + ], + } + ] + >>> processor = AutoProcessor.from_pretrained(MODEL_PATH) + >>> model = Glm4vMoeForConditionalGeneration.from_pretrained( + pretrained_model_name_or_path=MODEL_PATH, + dtype="auto", + device_map="auto", + ) + >>> inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ).to(model.device) + >>> inputs.pop("token_type_ids", None) + >>> generated_ids = model.generate(**inputs, max_new_tokens=8192) + >>> output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + mm_token_type_ids=mm_token_type_ids, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Build output kwargs and include aux_loss only if present (depends on transformers version) + output_kwargs = dict( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + 
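# token-level metrics below are populated only on the fused-loss (skip_logits) path +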
token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) + if hasattr(outputs, "aux_loss"): + output_kwargs["aux_loss"] = outputs.aux_loss + + # Return GLM4V MoE output with accuracy + return LigerGlm4vMoeCausalLMOutputWithPast(**output_kwargs) diff --git a/src/liger_kernel/transformers/model/gpt_oss.py b/src/liger_kernel/transformers/model/gpt_oss.py new file mode 100755 index 0000000000000000000000000000000000000000..8787fde65d0cd1a814cb18a25c8acbf72b118537 --- /dev/null +++ b/src/liger_kernel/transformers/model/gpt_oss.py @@ -0,0 +1,213 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from transformers.modeling_outputs import MoeModelOutputWithPast +from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> LigerMoeCausalLMOutputWithPast: + r""" + Forward pass for causal language modeling with Mixture of Experts (MoE) architecture using Liger Kernel optimizations. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using tokenizers. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + past_key_values (`List[torch.FloatTensor]` or `Cache`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up + sequential decoding. See `past_key_values` input for more details. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the router logits of all MoE layers. See `router_logits` under returned tensors + for more detail. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + skip_logits (`bool`, *optional*): + Whether to skip logit computation and directly compute loss. If `None`, defaults to `True` during training + when labels are provided (to save memory), and `False` during inference. + + Returns: + `LigerMoeCausalLMOutputWithPast`: An output object containing: + - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction), including the auxiliary load balancing loss. + - aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided): + Auxiliary load balancing loss for the sparse MoE modules. + - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + Note: logits are `None` during training when `skip_logits=True` to save memory. + - past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed): + Cached key and value projection states for faster sequential decoding. + - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer. + - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax. + - router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + Router logits of the MoE layers, useful to compute the auxiliary loss and z_loss. + - token_accuracy (`torch.FloatTensor`, *optional*, returned when `labels` is provided): + Token-level prediction accuracy. 
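+ - predicted_tokens (`torch.LongTensor`, *optional*, returned when `labels` is provided):
+ Predicted token ids computed alongside `token_accuracy` on the fused Liger loss path.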
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, GptOssForCausalLM
+ >>> from liger_kernel.transformers import apply_liger_kernel_to_gpt_oss
+
+ >>> # Apply Liger Kernel patches for optimized performance
+ >>> apply_liger_kernel_to_gpt_oss()
+
+ >>> model = GptOssForCausalLM.from_pretrained("openai/gpt-oss-20b")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Inference: Forward pass returns logits
+ >>> outputs = model(**inputs)
+ >>> outputs.logits.shape
+ torch.Size([1, 12, 201088])
+
+ >>> # Get next token prediction
+ >>> next_token_logits = outputs.logits[:, -1, :]
+ >>> predicted_token_id = next_token_logits.argmax(dim=-1)
+
+ >>> # Training: Forward pass with labels returns loss
+ >>> labels = inputs.input_ids.clone()
+ >>> outputs = model(**inputs, labels=labels)
+ >>> outputs.loss
+ tensor(2.6454)
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs: MoeModelOutputWithPast = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_router_logits=output_router_logits,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = outputs.last_hidden_state
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ kept_hidden_states = hidden_states[:, slice_indices, :]
+
+ shift_labels = kwargs.pop("shift_labels", None)
+ logits = None
+ loss = None
+ token_accuracy = None
+ predicted_tokens = None
+
+ if skip_logits is None:
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+ if skip_logits:
+ result = LigerForCausalLMLoss(
+ hidden_states=kept_hidden_states,
+ lm_head_weight=self.lm_head.weight,
+ labels=labels,
+ shift_labels=shift_labels,
+ hidden_size=self.config.hidden_size,
+ **kwargs,
+ )
+ loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+ else: # if in inference mode, materialize logits
+ logits = self.lm_head(kept_hidden_states)
+ if labels is not None or shift_labels is not None:
+ loss = self.loss_function(
+ logits=logits,
+ labels=labels,
+ shift_labels=shift_labels,
+ vocab_size=self.vocab_size,
+ **kwargs,
+ )
+
+ aux_loss = None
+ if output_router_logits:
+ aux_loss = load_balancing_loss_func(
+ outputs.router_logits,
+ self.num_experts,
+ self.num_experts_per_tok,
+ attention_mask,
+ )
+ if labels is not None:
+ loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure it resides on the same device
+
+ return LigerMoeCausalLMOutputWithPast(
+ loss=loss,
+ aux_loss=aux_loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_logits,
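+ # the fields below are None unless the fused Liger loss path computed them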
+ token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/hunyuan_v1.py b/src/liger_kernel/transformers/model/hunyuan_v1.py new file mode 100755 index 0000000000000000000000000000000000000000..dd5aa7a21328ee8270789faef9204f5138cfadbb --- /dev/null +++ b/src/liger_kernel/transformers/model/hunyuan_v1.py @@ -0,0 +1,137 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, HunYuanDenseV1ForCausalLM + + >>> model = HunYuanDenseV1ForCausalLM.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
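+ >>> # Illustrative training-style call: with `labels` provided the forward
+ >>> # returns a loss, computed by the fused Liger kernel when in training mode.
+ >>> outputs = model(**inputs, labels=inputs.input_ids)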
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/internvl.py b/src/liger_kernel/transformers/model/internvl.py new file mode 100755 index 0000000000000000000000000000000000000000..d9c5aa365179461e0227c8a14dba8d249d29a817 --- /dev/null +++ b/src/liger_kernel/transformers/model/internvl.py @@ -0,0 +1,160 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.utils import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerInternVLCausalLMOutputWithPast + + +# Copied from 
https://github.com/huggingface/transformers/blob/d888bd435d0c0eaabaabad5b33d52af518c7187c/src/transformers/models/internvl/modeling_internvl.py#L862 +@can_return_tuple +def lce_forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + skip_logits: Optional[bool] = None, # Added argument for liger-kernel + **lm_kwargs, # renamed from kwargs +) -> Union[Tuple, LigerInternVLCausalLMOutputWithPast]: + r""" + Example: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, AutoModelForImageTextToText + + >>> torch_device = "cuda" + >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf") + >>> model = AutoModelForImageTextToText.from_pretrained( + ... "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device + ... ) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg", + ... }, + ... { + ... "type": "image", + ... "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg", + ... }, + ... {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"}, + ... ], + ... }, + ... ] + + >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device) + >>> generate_ids = model.generate(**inputs, max_new_tokens=200) + >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)) + The images depict the Statue of Liberty and the Golden Gate Bridge. 
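+
+    >>> # A hedged training-style sketch: with labels provided, skip_logits
+    >>> # defaults to True in training mode and the fused Liger loss path is used
+    >>> labels = inputs["input_ids"].clone()
+    >>> outputs = model(**inputs, labels=labels)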
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + image_sizes=image_sizes, + **lm_kwargs, + ) + + # Copied from llava.py + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = lm_kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **lm_kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerInternVLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/llama.py b/src/liger_kernel/transformers/model/llama.py new file mode 100755 index 0000000000000000000000000000000000000000..9ad3edcb3a135a61268a34b320309f505265ab7a --- /dev/null +++ b/src/liger_kernel/transformers/model/llama.py @@ -0,0 +1,202 @@ +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Tuple +from 
typing import Union + +import torch + +from torch.distributed.fsdp import FullyShardedDataParallel + +from liger_kernel.transformers.fsdp import _FSDPForwardRedirection +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast +from liger_kernel.utils import PEFT_AVAILABLE + +if TYPE_CHECKING: + from transformers.cache_utils import Cache + +if PEFT_AVAILABLE: + from peft.utils.other import ModulesToSaveWrapper + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
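+
+    >>> # A hedged sketch of forcing the fused path: skip_logits=True requires
+    >>> # labels (or shift_labels), otherwise this forward raises a ValueError
+    >>> labels = inputs.input_ids.clone()
+    >>> outputs = model(**inputs, labels=labels, skip_logits=True)
+    >>> outputs.logits is None
+    True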
+    ```"""
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    if self.config.pretraining_tp > 1:
+        raise Exception("Liger Kernel does not support pretraining_tp!")
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    # Compute loss
+    if skip_logits:
+        result = lce_maybe_trainable_lm_head(
+            self,
+            hidden_states=kept_hidden_states,
+            hidden_size=self.config.hidden_size,
+            labels=labels,
+            shift_labels=shift_labels,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    # Return custom output class with token_accuracy field
+    return LigerCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
+
+
+def lce_maybe_trainable_lm_head(self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
+    lm_head = self.lm_head
+
+    # Unwrap the module if lm_head has been added as a trainable module in the PEFT LoRA
+    # configuration, i.e. listed in the modules_to_save field of LoraConfig, so the lm_head
+    # weights are read from the unwrapped module.
+    # See https://huggingface.co/docs/peft/package_reference/lora for reference.
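+    # Rough shape of the wrapper handled below (illustrative only; exact PEFT
+    # internals may differ): lm_head.original_module holds the frozen base linear
+    # layer, while lm_head.modules_to_save.default is the trainable copy whose
+    # weight the kernel should read.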
+    if PEFT_AVAILABLE and isinstance(lm_head, ModulesToSaveWrapper):
+        lm_head = lm_head.modules_to_save.default
+
+    # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA,
+    # reading the lm_head module weights and calling the kernel must be done within the FSDP
+    # forward pass so the module's entire parameters are summoned and kept in memory during
+    # kernel execution.
+    if isinstance(lm_head, FullyShardedDataParallel):
+        return _FSDPForwardRedirection()(
+            lm_head,
+            _liger_for_causal_lm_loss,
+            lm_head.module,
+            hidden_states,
+            hidden_size,
+            labels,
+            shift_labels,
+            **loss_kwargs,
+        )
+
+    # FSDP is not used, so we can read the (possibly unwrapped) lm_head weights and call
+    # the kernel directly
+    return _liger_for_causal_lm_loss(
+        lm_head=lm_head,
+        hidden_states=hidden_states,
+        hidden_size=hidden_size,
+        labels=labels,
+        shift_labels=shift_labels,
+        **loss_kwargs,
+    )
+
+
+def _liger_for_causal_lm_loss(lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
+    return LigerForCausalLMLoss(
+        hidden_states=hidden_states,
+        lm_head_weight=lm_head.weight,
+        labels=labels,
+        hidden_size=hidden_size,
+        shift_labels=shift_labels,
+        **loss_kwargs,
+    )
diff --git a/src/liger_kernel/transformers/model/llama4.py b/src/liger_kernel/transformers/model/llama4.py
new file mode 100755
index 0000000000000000000000000000000000000000..32d4986a94aa987451c2162da622adc8fa0008cb
--- /dev/null
+++ b/src/liger_kernel/transformers/model/llama4.py
@@ -0,0 +1,124 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+
+from transformers.cache_utils import Cache
+
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
+
+
+def lce_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    **kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+    r"""
+    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, Llama4ForCausalLM
+
+    >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
+    >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
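+    >>> # (The checkpoint ids above follow the upstream docstring template and may
+    >>> # not correspond to a released repository)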
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + # Compute loss + if self.training and (labels is not None or shift_labels is not None): + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: # if in inference mode materialize logits + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/llava.py b/src/liger_kernel/transformers/model/llava.py new file mode 100755 index 0000000000000000000000000000000000000000..1af92165e8b84472ac0f4ef9d4b512edcd84a4a8 --- /dev/null +++ b/src/liger_kernel/transformers/model/llava.py @@ -0,0 +1,160 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerLlavaCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: 
torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: torch.Tensor = None, + skip_logits: Optional[bool] = None, + **lm_kwargs, +) -> Union[Tuple, LigerLlavaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, LlavaForConditionalGeneration + + >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") + >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + cache_position=cache_position, + image_sizes=image_sizes, + **lm_kwargs, + ) + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = lm_kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **lm_kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.text_config.vocab_size, + **lm_kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerLlavaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/loss_utils.py b/src/liger_kernel/transformers/model/loss_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..508b3583d11d479df68dcec8e6ad82463b02a21e --- /dev/null +++ b/src/liger_kernel/transformers/model/loss_utils.py @@ -0,0 
+1,106 @@ +import inspect + +from typing import Optional +from typing import Tuple + +import torch +import torch.nn as nn + +import liger_kernel.transformers.functional as F + +from liger_kernel.transformers.functional import CrossEntropyOutput + + +def unpack_cross_entropy_result( + result, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + if isinstance(result, CrossEntropyOutput): + return result.loss, result.z_loss, result.token_accuracy, result.predicted_tokens + + if isinstance(result, tuple): + loss = result[0] + z_loss = result[1] if len(result) > 1 else None + token_accuracy = result[2] if len(result) > 2 else None + predicted_tokens = result[3] if len(result) > 3 else None + return loss, z_loss, token_accuracy, predicted_tokens + + return result, None, None, None + + +def fixed_fused_linear_cross_entropy( + hidden_states: torch.Tensor, + lm_head_weight: torch.Tensor, + target: torch.Tensor, + num_items_in_batch: Optional[int] = None, + ignore_index: int = -100, + final_logit_softcapping: Optional[float] = None, + accum_dtype: Optional[torch.dtype] = None, + return_token_accuracy: bool = False, + return_predicted_tokens: bool = False, + **kwargs, +): + reduction = "sum" if num_items_in_batch is not None else "mean" + result = F.liger_fused_linear_cross_entropy( + hidden_states, + lm_head_weight, + target, + reduction=reduction, + ignore_index=ignore_index, + softcap=final_logit_softcapping, + accum_dtype=accum_dtype, + return_token_accuracy=return_token_accuracy, + return_predicted_tokens=return_predicted_tokens, + **kwargs, + ) + + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + if reduction == "sum": + loss = loss / num_items_in_batch + + if return_token_accuracy or return_predicted_tokens: + return CrossEntropyOutput(loss=loss, token_accuracy=token_accuracy, predicted_tokens=predicted_tokens) + + return loss + + +def LigerForCausalLMLoss( + hidden_states, + lm_head_weight, + labels, + hidden_size: int, + num_items_in_batch: Optional[int] = None, + ignore_index: int = -100, + shift_labels: Optional[torch.Tensor] = None, + final_logit_softcapping: Optional[float] = None, + return_token_accuracy: bool = False, + return_predicted_tokens: bool = False, + **kwargs, +): + # Filter out inapplicable kwargs to liger_fused_linear_cross_entropy + applicable_params = inspect.signature(F.liger_fused_linear_cross_entropy).parameters + kwargs = {k: v for k, v in kwargs.items() if k in applicable_params} + + # Skip upcast since intermediate values for the loss are all fp32 in kernel + if shift_labels is None: + # Shift so that token < n predict n + labels = nn.functional.pad(labels, (0, 1), value=ignore_index) + shift_labels = labels[..., 1:].contiguous() + + # Flatten the tokens + hidden_states = hidden_states.view(-1, hidden_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(hidden_states.device) + result = fixed_fused_linear_cross_entropy( + hidden_states, + lm_head_weight, + shift_labels, + num_items_in_batch, + ignore_index, + final_logit_softcapping, + return_token_accuracy=return_token_accuracy, + return_predicted_tokens=return_predicted_tokens, + **kwargs, + ) + return result diff --git a/src/liger_kernel/transformers/model/mistral.py b/src/liger_kernel/transformers/model/mistral.py new file mode 100755 index 0000000000000000000000000000000000000000..09efebf5fa14140be230a0ed67e3eb89da78a637 --- /dev/null +++ 
b/src/liger_kernel/transformers/model/mistral.py @@ -0,0 +1,146 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.cache_utils import Cache + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Copy paste Mistral's forward but replace torch cross entropy with liger fused linear cross entropy + + + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MistralForCausalLM + + >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
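+
+    >>> # A hedged training-mode sketch: with labels, logits are never
+    >>> # materialized and outputs.logits stays None
+    >>> _ = model.train()
+    >>> outputs = model(**inputs, labels=inputs.input_ids.clone())
+    >>> outputs.logits is None
+    True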
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + + loss = None + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output = (loss,) + output_tuple if loss is not None else output_tuple + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/mixtral.py b/src/liger_kernel/transformers/model/mixtral.py new file mode 100755 index 0000000000000000000000000000000000000000..5c87746bd64af2bc627394d5177af7d53126c818 --- /dev/null +++ b/src/liger_kernel/transformers/model/mixtral.py @@ -0,0 +1,167 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast + + +# Ignore copy +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerMoeCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MixtralForCausalLM + + >>> model = MixtralForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
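+
+    >>> # A hedged sketch: requesting router logits also computes the MoE
+    >>> # load-balancing auxiliary loss, added to the main loss scaled by
+    >>> # router_aux_loss_coef
+    >>> outputs = model(**inputs, labels=inputs.input_ids.clone(), output_router_logits=True)
+    >>> outputs.aux_loss is not None
+    True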
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + + loss = None + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.vocab_size, + **kwargs, + ) + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + if output_router_logits: + output_tuple = (aux_loss,) + output_tuple + if token_accuracy is not None: + output_tuple = output_tuple + (token_accuracy,) + if predicted_tokens is not None: + output_tuple = output_tuple + (predicted_tokens,) + return (loss,) + output_tuple if loss is not None else output_tuple + + # Return custom output class with token_accuracy field + return LigerMoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits if return_dict else outputs[-1], + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/mllama.py b/src/liger_kernel/transformers/model/mllama.py new file mode 100755 index 
0000000000000000000000000000000000000000..72094f77a7633ef78cbfd3c00a7e446479c3b4dd --- /dev/null +++ b/src/liger_kernel/transformers/model/mllama.py @@ -0,0 +1,149 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.cache_utils import Cache + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + cross_attention_states: Optional[torch.LongTensor] = None, + cross_attention_mask: Optional[torch.LongTensor] = None, + full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MllamaForCausalLM + + >>> model = MllamaForCausalLM.from_pretrained("Llama-3.2-11B-Vision") + >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision") + + >>> prompt = "If I had to write a haiku, it would be:" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6) + >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + >>> print(result) + If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful. 
+ I love the idea of snowflakes gently falling, each one + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + cross_attention_states=cross_attention_states, + attention_mask=attention_mask, + position_ids=position_ids, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/olmo2.py b/src/liger_kernel/transformers/model/olmo2.py new file mode 100755 index 0000000000000000000000000000000000000000..e78d7815af7d3673e6faebd0b57a17cc4c3d696f --- /dev/null +++ b/src/liger_kernel/transformers/model/olmo2.py @@ -0,0 +1,141 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import 
LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Olmo2ForCausalLM + + >>> model = Olmo2ForCausalLM.from_pretrained("allenai/Olmo2-1B-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo2-1B-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'Hey, are you conscious? 
Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m' + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/olmo3.py b/src/liger_kernel/transformers/model/olmo3.py new file mode 100755 index 0000000000000000000000000000000000000000..e9d1b54a8c252748acd620ded45684cc16976d1e --- /dev/null +++ b/src/liger_kernel/transformers/model/olmo3.py @@ -0,0 +1,143 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.modeling_outputs import BaseModelOutputWithPast + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + 
self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Olmo3ForCausalLM + + >>> model = Olmo3ForCausalLM.from_pretrained("allenai/Olmo-3-7B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("allenai/Olmo-3-7B-Instruct") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'Hey, are you conscious? 
Can you talk to me?\nI’m not sure if you’re conscious of this, but I’m' + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/output_classes.py b/src/liger_kernel/transformers/model/output_classes.py new file mode 100755 index 0000000000000000000000000000000000000000..f6b768c5065ba1a97a6c384500d39b3afd395e15 --- /dev/null +++ b/src/liger_kernel/transformers/model/output_classes.py @@ -0,0 +1,173 @@ +""" +Custom output classes for Liger-Kernel that extend transformers' ModelOutput classes +with optional token accuracy field. 
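+
+Each `Liger*` class subclasses the matching transformers output class and adds two optional
+fields, `token_accuracy` and `predicted_tokens`, so code written against the base classes
+keeps working. A minimal usage sketch (assuming `model` is a Liger-patched causal LM and
+`ids` is a batch of token ids; both names are illustrative):
+
+    >>> out = model(input_ids=ids, labels=ids)
+    >>> out.loss            # standard CausalLMOutputWithPast field
+    >>> out.token_accuracy  # Liger extra; None unless the fused loss path computed it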
+""" + +from dataclasses import dataclass +from typing import Optional + +import torch + +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_outputs import MoeCausalLMOutputWithPast + +# The following model-specific outputs are optional and depend on the installed +# transformers version. Guard their imports so our module remains importable +# even when those models are not available in the environment. +try: + from transformers.models.gemma3.modeling_gemma3 import Gemma3CausalLMOutputWithPast as _Gemma3CausalLMOutputWithPast +except Exception: + _Gemma3CausalLMOutputWithPast = None + +try: + from transformers.models.glm4v_moe.modeling_glm4v_moe import ( + Glm4vMoeCausalLMOutputWithPast as _Glm4vMoeCausalLMOutputWithPast, + ) +except Exception: + _Glm4vMoeCausalLMOutputWithPast = None + +try: + from transformers.models.internvl.modeling_internvl import ( + InternVLCausalLMOutputWithPast as _InternVLCausalLMOutputWithPast, + ) +except Exception: + _InternVLCausalLMOutputWithPast = None + +try: + from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast as _LlavaCausalLMOutputWithPast +except Exception: + _LlavaCausalLMOutputWithPast = None + +try: + from transformers.models.paligemma.modeling_paligemma import ( + PaliGemmaCausalLMOutputWithPast as _PaliGemmaCausalLMOutputWithPast, + ) +except Exception: + _PaliGemmaCausalLMOutputWithPast = None + +try: + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( + Qwen2_5_VLCausalLMOutputWithPast as _Qwen2_5_VLCausalLMOutputWithPast, + ) +except Exception: + _Qwen2_5_VLCausalLMOutputWithPast = None + +try: + from transformers.models.qwen2_vl.modeling_qwen2_vl import ( + Qwen2VLCausalLMOutputWithPast as _Qwen2VLCausalLMOutputWithPast, + ) +except Exception: + _Qwen2VLCausalLMOutputWithPast = None + +try: + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLCausalLMOutputWithPast as _Qwen3VLCausalLMOutputWithPast, + ) +except Exception: + _Qwen3VLCausalLMOutputWithPast = None + +try: + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import ( + Qwen3VLMoeCausalLMOutputWithPast as _Qwen3VLMoeCausalLMOutputWithPast, + ) +except Exception: + _Qwen3VLMoeCausalLMOutputWithPast = None + +try: + from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5CausalLMOutputWithPast as _Qwen3_5CausalLMOutputWithPast, + ) +except Exception: + _Qwen3_5CausalLMOutputWithPast = None + + +@dataclass +class LigerCausalLMOutputWithPast(CausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +@dataclass +class LigerMoeCausalLMOutputWithPast(MoeCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Gemma3CausalLMOutputWithPast is not None: + + @dataclass + class LigerGemma3CausalLMOutputWithPast(_Gemma3CausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Glm4vMoeCausalLMOutputWithPast is not None: + + @dataclass + class LigerGlm4vMoeCausalLMOutputWithPast(_Glm4vMoeCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _LlavaCausalLMOutputWithPast is not None: + + @dataclass + class LigerLlavaCausalLMOutputWithPast(_LlavaCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: 
Optional[torch.LongTensor] = None + + +if _InternVLCausalLMOutputWithPast is not None: + + @dataclass + class LigerInternVLCausalLMOutputWithPast(_InternVLCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _PaliGemmaCausalLMOutputWithPast is not None: + + @dataclass + class LigerPaliGemmaCausalLMOutputWithPast(_PaliGemmaCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Qwen2_5_VLCausalLMOutputWithPast is not None: + + @dataclass + class LigerQwen2_5_VLCausalLMOutputWithPast(_Qwen2_5_VLCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Qwen2VLCausalLMOutputWithPast is not None: + + @dataclass + class LigerQwen2VLCausalLMOutputWithPast(_Qwen2VLCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Qwen3VLCausalLMOutputWithPast is not None: + + @dataclass + class LigerQwen3VLCausalLMOutputWithPast(_Qwen3VLCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Qwen3VLMoeCausalLMOutputWithPast is not None: + + @dataclass + class LigerQwen3VLMoeCausalLMOutputWithPast(_Qwen3VLMoeCausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None + + +if _Qwen3_5CausalLMOutputWithPast is not None: + + @dataclass + class LigerQwen3_5CausalLMOutputWithPast(_Qwen3_5CausalLMOutputWithPast): + token_accuracy: Optional[torch.FloatTensor] = None + predicted_tokens: Optional[torch.LongTensor] = None diff --git a/src/liger_kernel/transformers/model/paligemma.py b/src/liger_kernel/transformers/model/paligemma.py new file mode 100755 index 0000000000000000000000000000000000000000..235635771a69aa6c34be15d5f7dfc5165917777f --- /dev/null +++ b/src/liger_kernel/transformers/model/paligemma.py @@ -0,0 +1,250 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from torch.nn import CrossEntropyLoss +from transformers.cache_utils import Cache +from transformers.utils import is_torchdynamo_compiling +from transformers.utils import logging + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerPaliGemmaCausalLMOutputWithPast + +logger = logging.get_logger(__name__) + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **lm_kwargs, +) -> Union[Tuple, LigerPaliGemmaCausalLMOutputWithPast]: + r""" + Args: 
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration + + >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf") + >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf") + + >>> prompt = "answer en Where is the cow standing?" + >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_length=30) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "answer en Where is the cow standing?\nbeach" + ```""" + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + is_training = token_type_ids is not None and labels is not None + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + 1 # Paligemma positions are 1-indexed + + # Merge text and images + if pixel_values is not None: + image_features = self.get_image_features(pixel_values) + + special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): + image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index) + raise ValueError( + 
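+                    # the expanded placeholder mask must account for exactly as many elements
+                    # as the projected image features about to be scattered into the embeddings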
f"Number of images does not match number of special image tokens in the input text. " + f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} " + "tokens from image embeddings." + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # mask out pad-token-ids in labels for BC + if labels is not None and self.pad_token_id in labels: + logger.warning_once( + "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. " + "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.", + ) + labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels) + + causal_mask = self._update_causal_mask( + attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training + ) + + outputs = self.language_model.model( + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + + shift_labels = lm_kwargs.pop("shift_labels", None) + hidden_states = outputs[0] + + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None: + raise ValueError("skip_logits is True, but labels is None") + + if skip_logits is None: + skip_logits = self.training and (labels is not None) + + if skip_logits: + shift_hidden_states = hidden_states[..., :-1, :] + shift_labels = labels[..., 1:] + + hidden_device = shift_hidden_states.device + + if attention_mask is not None: + # we use the input attention mask to shift the hidden_states and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device) + shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_hidden_states = shift_hidden_states.contiguous() + shift_labels = shift_labels.contiguous() + + # Flatten hidden state + shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size) + shift_labels = shift_labels.view(-1).to(hidden_device) + + # Use LigerForCausalLMLoss with accuracy support and pass already shifted labels + result = LigerForCausalLMLoss( + hidden_states=shift_hidden_states, + lm_head_weight=self.language_model.lm_head.weight, + labels=None, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **lm_kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.language_model.lm_head(hidden_states) + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + elif shift_labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + shift_logits = logits[..., :-1, :] + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device) + shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous() + else: + shift_logits = shift_logits.contiguous() + shift_labels = shift_labels.contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + + flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size) + flat_labels = shift_labels.view(-1).to(shift_logits.device) + loss = loss_fct(flat_logits, flat_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return PaliGemma output with token_accuracy field + return LigerPaliGemmaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/phi3.py b/src/liger_kernel/transformers/model/phi3.py new file mode 100755 index 0000000000000000000000000000000000000000..b3a9fa1f1a7aed6be543cfe2d24891f5cb5221ee --- /dev/null +++ b/src/liger_kernel/transformers/model/phi3.py @@ -0,0 +1,123 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.modeling_outputs import BaseModelOutputWithPast + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: 
Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    skip_logits: Optional[bool] = None,
+    **kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+    r"""
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, Phi3ForCausalLM
+
+    >>> model = Phi3ForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    outputs: BaseModelOutputWithPast = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs.last_hidden_state
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    # Compute loss
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    # Return custom output class with token_accuracy field
+    return LigerCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
diff --git a/src/liger_kernel/transformers/model/pixtral.py
b/src/liger_kernel/transformers/model/pixtral.py
new file mode 100755
index 0000000000000000000000000000000000000000..c8b3b7b69d7159a6f928e083376a4c40a82314c2
--- /dev/null
+++ b/src/liger_kernel/transformers/model/pixtral.py
@@ -0,0 +1,4 @@
+# Pixtral vision encoder does not require a custom forward function.
+# The Liger kernel optimizations for Pixtral (RMSNorm, SwiGLU, RoPE) are applied
+# via class/function-level monkey patching in monkey_patch.py, which is sufficient
+# since the vision encoder has no cross-entropy loss to fuse.
diff --git a/src/liger_kernel/transformers/model/qwen2.py b/src/liger_kernel/transformers/model/qwen2.py
new file mode 100755
index 0000000000000000000000000000000000000000..6d43caadaa5856de4558bd2a49a4ab1e55faf8cd
--- /dev/null
+++ b/src/liger_kernel/transformers/model/qwen2.py
@@ -0,0 +1,260 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
+
+
+def lce_forward_deprecated(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+    r"""
+    Copy-paste of Qwen2's forward, with torch cross entropy replaced by Liger fused linear cross entropy.
+
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
+
+    >>> model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2-1.5B")
+    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B")
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        cache_position=cache_position,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
+    if skip_logits:
+        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+
+        # flatten tokens
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+        shift_labels = shift_labels.view(-1)
+
+        lce = LigerFusedLinearCrossEntropyLoss()
+        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+
+    else:
+        logits = self.lm_head(hidden_states)
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return CausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
+
+
+def lce_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    skip_logits: Optional[bool] = None,
+    **kwargs,
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output = (loss,) + output_tuple if loss is not None else output_tuple + output = output + (token_accuracy,) if token_accuracy is not None else output 
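+        # the extra Liger fields are appended after the standard tuple entries so that
+        # positional unpacking of (loss, logits, past_key_values, ...) keeps working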
+ output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/qwen2_5_vl.py b/src/liger_kernel/transformers/model/qwen2_5_vl.py new file mode 100755 index 0000000000000000000000000000000000000000..ac4aae51cfcb799fdec89bf1a76413c04ce33123 --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen2_5_vl.py @@ -0,0 +1,186 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from packaging import version +from transformers import __version__ as transformers_version +from transformers.utils import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast + +_TRANSFORMERS_V5_OR_LATER = version.parse(transformers_version) >= version.parse("5.0.0") + + +def _get_hidden_size(config) -> int: + """Get hidden_size from Qwen2.5VLConfig in a version-aware manner.""" + if _TRANSFORMERS_V5_OR_LATER: + return config.text_config.hidden_size + return config.hidden_size + + +def _get_vocab_size(config) -> int: + """Get vocab_size from Qwen2.5VLConfig in a version-aware manner.""" + if _TRANSFORMERS_V5_OR_LATER: + return config.text_config.vocab_size + return config.vocab_size + + +@can_return_tuple +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerQwen2_5_VLCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)): + The tensors corresponding to the input videos. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. 
[`Qwen2_5_VLProcessor`] uses
+        [`Qwen2_5_VLImageProcessor`] for processing videos.
+    image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+        The temporal, height and width of feature shape of each image in LLM.
+    video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+        The temporal, height and width of feature shape of each video in LLM.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
+        The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+
+    Example:
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+    >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+    >>> messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": "What is shown in this image?"},
+            ],
+        },
+    ]
+    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(**inputs, max_length=30)
+    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mm_token_type_ids=mm_token_type_ids, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + shift_labels = kwargs.pop("shift_labels", None) + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=_get_hidden_size(self.config), + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=_get_vocab_size(self.config), + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output = (loss,) + output_tuple if loss is not None else output_tuple + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return Qwen2.5-VL output with token accuracy + return LigerQwen2_5_VLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/qwen2_vl.py b/src/liger_kernel/transformers/model/qwen2_vl.py new file mode 100755 index 0000000000000000000000000000000000000000..b51600a2eddedc26d7d41afce628cc34a066f2be --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen2_vl.py @@ -0,0 +1,182 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from packaging import version +from transformers import __version__ as transformers_version +from transformers.utils import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast + +_TRANSFORMERS_V5_OR_LATER = version.parse(transformers_version) >= version.parse("5.0.0") + + +def _get_hidden_size(config) -> int: + 
"""Get hidden_size from Qwen2VLConfig in a version-aware manner.""" + if _TRANSFORMERS_V5_OR_LATER: + return config.text_config.hidden_size + return config.hidden_size + + +def _get_vocab_size(config) -> int: + """Get vocab_size from Qwen2VLConfig in a version-aware manner.""" + if _TRANSFORMERS_V5_OR_LATER: + return config.text_config.vocab_size + return config.vocab_size + + +@can_return_tuple +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerQwen2VLCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)): + The tensors corresponding to the input videos. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`Qwen2VLImageProcessor.__call__`] for details. [`Qwen2VLProcessor`] uses + [`Qwen2VLImageProcessor`] for processing videos. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. 
+
+    Example:
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+
+    >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+
+    >>> messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": "What is shown in this image?"},
+            ],
+        },
+    ]
+    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+
+    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(**inputs, max_length=30)
+    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+    ```"""
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    outputs = self.model(
+        input_ids=input_ids,
+        pixel_values=pixel_values,
+        pixel_values_videos=pixel_values_videos,
+        image_grid_thw=image_grid_thw,
+        video_grid_thw=video_grid_thw,
+        position_ids=position_ids,
+        attention_mask=attention_mask,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        mm_token_type_ids=mm_token_type_ids,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    loss = None
+    logits = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    # Compute loss
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=_get_hidden_size(self.config),
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=_get_vocab_size(self.config),
+            )
+
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    # Return Qwen2VL output with token accuracy
+    return LigerQwen2VLCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+
attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/qwen3.py b/src/liger_kernel/transformers/model/qwen3.py new file mode 100755 index 0000000000000000000000000000000000000000..5b64fb90ccaa933549360662aba9070beae33156 --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen3.py @@ -0,0 +1,139 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen3ForCausalLM + + >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + # Remove output-control parameters that shouldn't be passed to loss functions + kwargs.pop("return_dict", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/qwen3_5.py b/src/liger_kernel/transformers/model/qwen3_5.py new file mode 100755 index 0000000000000000000000000000000000000000..b94304b653c0429aba3d3c020b13b896496b5466 --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen3_5.py @@ -0,0 +1,256 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast +from liger_kernel.transformers.model.output_classes import LigerQwen3_5CausalLMOutputWithPast + + +def 
lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> LigerCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen3_5ForCausalLM + + >>> model = Qwen3_5ForCausalLM.from_pretrained("Qwen/Qwen3.5-9B") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3.5-9B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
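+
+    >>> # Hedged sketch: the liger-specific `skip_logits` flag can force logit
+    >>> # materialization during training (e.g. for debugging):
+    >>> _ = model.train()
+    >>> out = model(**inputs, labels=inputs.input_ids, skip_logits=False)
+    >>> out.logits is not None
+    True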
+ ```""" + return_dict = kwargs.pop("return_dict", None) + if return_dict is None: + return_dict = self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) + + +def lce_forward_for_multimodal( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[tuple, LigerQwen3_5CausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. 
+ + Example: + + ```python + >>> from transformers import AutoProcessor, Qwen3_5ForConditionalGeneration + + >>> model = Qwen3_5ForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") + >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") + + >>> messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", + }, + {"type": "text", "text": "Describe the image."}, + ], + } + ] + + >>> inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt" + ) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) + >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] + >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + >>> print(output_text) + ``` + """ + return_dict = kwargs.pop("return_dict", None) + if return_dict is None: + return_dict = self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + mm_token_type_ids=mm_token_type_ids, + **kwargs, + ) + + hidden_states = outputs[0] + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.text_config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + return LigerQwen3_5CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/qwen3_5_moe.py b/src/liger_kernel/transformers/model/qwen3_5_moe.py new file mode 100755 index 0000000000000000000000000000000000000000..e93a3d02ad4774665222a3d95ad97638dc10a51a --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen3_5_moe.py @@ -0,0 +1,157 @@ +from typing import TYPE_CHECKING +from typing import List +from 
typing import Optional +from typing import Union + +import torch + +from transformers.modeling_outputs import MoeModelOutputWithPast + +if TYPE_CHECKING: + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import load_balancing_loss_func + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerMoeCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3.5-35B-A3B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3.5-35B-A3B-Instruct") + + >>> prompt = "Give me a short introduction to large language model." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
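+
+    >>> # Hedged sketch: with `output_router_logits=True` and labels set, the scaled
+    >>> # MoE load-balancing loss is folded into `out.loss` and surfaced as `out.aux_loss`:
+    >>> _ = model.train()
+    >>> out = model(**inputs, labels=inputs.input_ids, output_router_logits=True)
+    >>> out.aux_loss is not None
+    True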
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_router_logits = (
+        output_router_logits if output_router_logits is not None else self.config.output_router_logits
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs: MoeModelOutputWithPast = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        mm_token_type_ids=mm_token_type_ids,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs.last_hidden_state
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:  # in inference mode, materialize logits
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.vocab_size,
+                **kwargs,
+            )
+
+    aux_loss = None
+    if output_router_logits:
+        # The top-level import of `load_balancing_loss_func` is guarded by
+        # TYPE_CHECKING, so import it here at call time to avoid a NameError.
+        from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import load_balancing_loss_func
+
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits,
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((aux_loss,) + output) if aux_loss is not None else output
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    return LigerMoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        router_logits=outputs.router_logits,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
diff --git a/src/liger_kernel/transformers/model/qwen3_moe.py b/src/liger_kernel/transformers/model/qwen3_moe.py
new file mode 100755
index 0000000000000000000000000000000000000000..cee0c9ad3d5e91092cb8db5131459e39048eff71
--- /dev/null
+++ b/src/liger_kernel/transformers/model/qwen3_moe.py
@@ -0,0 +1,155 @@
+from typing import List
+from typing import Optional
+from typing import Union
+
+import torch
+
+from
transformers.modeling_outputs import MoeModelOutputWithPast +from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerMoeCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM + + >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
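+
+    >>> # Hedged sketch: during inference, `logits_to_keep=1` computes logits only
+    >>> # for the final position, bounding memory for long prompts:
+    >>> _ = model.eval()
+    >>> out = model(**inputs, logits_to_keep=1)
+    >>> out.logits.shape[1]
+    1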
+    ```"""
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_router_logits = (
+        output_router_logits if output_router_logits is not None else self.config.output_router_logits
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs: MoeModelOutputWithPast = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs.last_hidden_state
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    # Compute loss
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:  # in inference mode, materialize logits
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.vocab_size,
+                **kwargs,
+            )
+
+    aux_loss = None
+    if output_router_logits:
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits,
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((aux_loss,) + output) if aux_loss is not None else output
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    # Return custom output class with accuracy field
+    return LigerMoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        router_logits=outputs.router_logits,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
diff --git a/src/liger_kernel/transformers/model/qwen3_next.py b/src/liger_kernel/transformers/model/qwen3_next.py
new file mode 100755
index 0000000000000000000000000000000000000000..5f6dd0062769637177d8a4968f0b24fce721a73c
--- /dev/null
+++ b/src/liger_kernel/transformers/model/qwen3_next.py
@@ -0,0 +1,155 @@
+from typing import TYPE_CHECKING
+from typing import List
+from typing import
Optional +from typing import Union + +import torch + +from transformers.modeling_outputs import MoeModelOutputWithPast + +if TYPE_CHECKING: + from transformers.models.qwen3_next.modeling_qwen3_next import load_balancing_loss_func + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerMoeCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct") + + >>> prompt = "Give me a short introduction to large language model." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
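+
+    >>> # Hedged sketch: labels that are already shifted by one position can be
+    >>> # supplied via the `shift_labels` kwarg (-100 marks positions with no target):
+    >>> import torch
+    >>> shift_labels = torch.cat(
+    ...     [inputs.input_ids[:, 1:], torch.full_like(inputs.input_ids[:, :1], -100)], dim=-1
+    ... )
+    >>> _ = model.train()
+    >>> out = model(**inputs, shift_labels=shift_labels)
+    >>> out.loss is not None
+    True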
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_router_logits = (
+        output_router_logits if output_router_logits is not None else self.config.output_router_logits
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs: MoeModelOutputWithPast = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs.last_hidden_state
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:  # in inference mode, materialize logits
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.vocab_size,
+                **kwargs,
+            )
+
+    aux_loss = None
+    if output_router_logits:
+        # The top-level import of `load_balancing_loss_func` is guarded by
+        # TYPE_CHECKING, so import it here at call time to avoid a NameError.
+        from transformers.models.qwen3_next.modeling_qwen3_next import load_balancing_loss_func
+
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits,
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((aux_loss,) + output) if aux_loss is not None else output
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    return LigerMoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        router_logits=outputs.router_logits,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
diff --git a/src/liger_kernel/transformers/model/qwen3_vl.py b/src/liger_kernel/transformers/model/qwen3_vl.py
new file mode 100755
index 0000000000000000000000000000000000000000..83738ebff14d7666cb9ddfe2baa7ec3a951fb7b0
--- /dev/null
+++ b/src/liger_kernel/transformers/model/qwen3_vl.py
@@ -0,0 +1,155 @@
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import torch
+
+from transformers.utils
import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerQwen3VLCausalLMOutputWithPast + + +@can_return_tuple +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerQwen3VLCausalLMOutputWithPast]: + """ + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)): + The tensors corresponding to the input videos. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses + [`Qwen2_5_VLImageProcessor`] for processing videos. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*): + The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs. 
+    Example:
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+    >>> model = Qwen3VLForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL")
+    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL")
+    >>> messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": "What is shown in this image?"},
+            ],
+        },
+    ]
+    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+    ```"""
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    outputs = self.model(
+        input_ids=input_ids,
+        pixel_values=pixel_values,
+        pixel_values_videos=pixel_values_videos,
+        image_grid_thw=image_grid_thw,
+        video_grid_thw=video_grid_thw,
+        second_per_grid_ts=second_per_grid_ts,
+        position_ids=position_ids,
+        attention_mask=attention_mask,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+        mm_token_type_ids=mm_token_type_ids,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs[0]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    loss = None
+    logits = None
+    token_accuracy = None
+    predicted_tokens = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.text_config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result)
+    else:
+        logits = self.lm_head(hidden_states)
+
+        # Mirror the fused path (and the sibling models) by honoring shift_labels here too
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.text_config.vocab_size,
+            )
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = (loss,) + output if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        output = output + (predicted_tokens,) if predicted_tokens is not None else output
+        return output
+
+    return LigerQwen3VLCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        rope_deltas=outputs.rope_deltas,
+        token_accuracy=token_accuracy,
+        predicted_tokens=predicted_tokens,
+    )
diff --git
a/src/liger_kernel/transformers/model/qwen3_vl_moe.py b/src/liger_kernel/transformers/model/qwen3_vl_moe.py new file mode 100755 index 0000000000000000000000000000000000000000..8c0c805f68f1baa9c42bb3ddc49df05b78e86e7a --- /dev/null +++ b/src/liger_kernel/transformers/model/qwen3_vl_moe.py @@ -0,0 +1,131 @@ +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import load_balancing_loss_func +from transformers.utils import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerQwen3VLMoeCausalLMOutputWithPast + + +@can_return_tuple +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + rope_deltas: Optional[torch.LongTensor] = None, + mm_token_type_ids: Optional[torch.IntTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerQwen3VLMoeCausalLMOutputWithPast]: + """ + Qwen3-VL-MoE forward with fused linear cross entropy support mirroring Qwen3-VL behaviour. 
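+
+    Example (a hedged sketch; assumes `model` and `inputs` are prepared with the
+    Qwen3-VL processor as in the `qwen3_vl.lce_forward` docstring above, and uses
+    `input_ids` as labels purely for illustration):
+
+    ```python
+    >>> _ = model.train()
+    >>> out = model(**inputs, labels=inputs["input_ids"], output_router_logits=True)
+    >>> # with labels given, out.loss folds in router_aux_loss_coef * out.aux_loss
+    ```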
+ """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mm_token_type_ids=mm_token_type_ids, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + + shift_labels = kwargs.pop("shift_labels", None) + loss = None + logits = None + token_accuracy = None + predicted_tokens = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + else: + logits = self.lm_head(hidden_states) + + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size) + + # Compute auxiliary load-balancing loss for MoE when requested + aux_loss = None + if kwargs.get("output_router_logits", False): + aux_loss = load_balancing_loss_func( + outputs.router_logits, + self.config.text_config.num_experts, + self.config.text_config.num_experts_per_tok, + attention_mask, + ) + # If we computed training loss, add the scaled aux loss to it + if loss is not None and aux_loss is not None: + loss = loss + self.config.text_config.router_aux_loss_coef * aux_loss.to(loss.device) + + if not return_dict: + output = (logits,) + outputs[1:] + output = (loss,) + output if loss is not None else output + output = output + (aux_loss,) if aux_loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + return LigerQwen3VLMoeCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=outputs.rope_deltas, + aux_loss=aux_loss, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) diff --git a/src/liger_kernel/transformers/model/smollm3.py b/src/liger_kernel/transformers/model/smollm3.py new file mode 100755 index 0000000000000000000000000000000000000000..3a9167f5658d7b26f10c76efbe399992dceddaf7 --- /dev/null +++ b/src/liger_kernel/transformers/model/smollm3.py @@ -0,0 +1,200 @@ +from typing import TYPE_CHECKING +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import torch + +from torch.distributed.fsdp import FullyShardedDataParallel + +from 
liger_kernel.transformers.fsdp import _FSDPForwardRedirection +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast +from liger_kernel.utils import PEFT_AVAILABLE + +if TYPE_CHECKING: + from transformers.cache_utils import Cache + +if PEFT_AVAILABLE: + from peft.utils.other import ModulesToSaveWrapper + + +def lce_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + **kwargs, +) -> Union[Tuple, LigerCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Smollm3ForCausalLM + + >>> model = Smollm3ForCausalLM.from_pretrained("HuggingFaceTB/SmolLM3-3B") + >>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
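+
+    >>> # Hedged note: with PEFT LoRA (e.g. the head listed in `modules_to_save`) or
+    >>> # under FSDP, the fused-loss path still applies; `lce_maybe_trainable_lm_head`
+    >>> # below resolves the actual lm_head weights before the kernel is invoked.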
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + predicted_tokens = None + + # if in training mode, don't materialize logits + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = lce_maybe_trainable_lm_head( + self, + hidden_states=kept_hidden_states, + hidden_size=self.config.hidden_size, + labels=labels, + shift_labels=shift_labels, + **kwargs, + ) + loss, _, token_accuracy, predicted_tokens = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output_tuple = (logits,) + outputs[1:] + output = (loss,) + output_tuple if loss is not None else output_tuple + output = output + (token_accuracy,) if token_accuracy is not None else output + output = output + (predicted_tokens,) if predicted_tokens is not None else output + return output + + # Return custom output class with token_accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + predicted_tokens=predicted_tokens, + ) + + +def lce_maybe_trainable_lm_head(self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs): + lm_head = self.lm_head + + # Unwrap the module if lm_head has been added as trainable module in PEFT LoRA configuration, + # i.e. listed in the modules_to_save field of LoraConfig, so the lm_head weights are read + # from the unwrapped module. + # See https://huggingface.co/docs/peft/package_reference/lora for reference. 
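+    # Illustration only (hypothetical config): the wrapper appears when the head is
+    # listed in `modules_to_save`, e.g.
+    #   LoraConfig(r=8, target_modules=["q_proj", "v_proj"], modules_to_save=["lm_head"])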
+ if PEFT_AVAILABLE and isinstance(lm_head, ModulesToSaveWrapper): + lm_head = lm_head.modules_to_save.default + + # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA, + # reading the lm_head module weights and calling the kernel must be done within FSDP forward pass + # so the module entire parameters are summoned and kept in memory during the kernel execution. + if isinstance(lm_head, FullyShardedDataParallel): + return _FSDPForwardRedirection()( + lm_head, + _liger_for_causal_lm_loss, + lm_head.module, + hidden_states, + hidden_size, + labels, + shift_labels, + **loss_kwargs, + ) + + # FSDP is not used so we can read the lm_head weights and call the kernel directly + return _liger_for_causal_lm_loss( + lm_head=self.lm_head, + hidden_states=hidden_states, + hidden_size=hidden_size, + labels=labels, + shift_labels=shift_labels, + **loss_kwargs, + ) + + +def _liger_for_causal_lm_loss(lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs): + return LigerForCausalLMLoss( + hidden_states=hidden_states, + lm_head_weight=lm_head.weight, + labels=labels, + hidden_size=hidden_size, + shift_labels=shift_labels, + **loss_kwargs, + ) diff --git a/src/liger_kernel/transformers/model/smolvlm.py b/src/liger_kernel/transformers/model/smolvlm.py new file mode 100755 index 0000000000000000000000000000000000000000..395c0c95770de3fff463c543c60c955adf8b7d7f --- /dev/null +++ b/src/liger_kernel/transformers/model/smolvlm.py @@ -0,0 +1,158 @@ +from typing import TYPE_CHECKING +from typing import Optional +from typing import Union + +import torch + +from transformers.models.smolvlm.modeling_smolvlm import SmolVLMCausalLMOutputWithPast +from transformers.processing_utils import Unpack +from transformers.utils.generic import can_return_tuple + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss + +if TYPE_CHECKING: + from transformers.cache_utils import Cache + from transformers.utils.generic import TransformersKwargs + + +# Forward adapted to enable fused Linear + CE without materializing logits. +# Mirrors the pattern used for other multimodal models (e.g., InternVL, LLaVA). +@can_return_tuple +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional["Cache"] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_hidden_states: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + skip_logits: Optional[bool] = None, # Added argument for liger-kernel + **lm_kwargs: Unpack["TransformersKwargs"], # renamed from kwargs +) -> Union[tuple, SmolVLMCausalLMOutputWithPast]: + r""" + pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): + Mask to avoid performing attention on padding pixel indices. + image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The hidden states of the image encoder after modality projection. 
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModelForImageTextToText + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct") + >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "video", "path": path/to/video}, + ... {"type": "text", "text": "What is happening in this video?"}, + ... ] + ... } + ... ] + + >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + pixel_attention_mask=pixel_attention_mask, + image_hidden_states=image_hidden_states, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + return_dict=True, + **lm_kwargs, + ) + + # Copied from llava.py + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = lm_kwargs.pop("shift_labels", None) + logits = None + loss = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + if skip_logits: + loss = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + 
lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.text_config.hidden_size, + **lm_kwargs, + ) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return SmolVLMCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=outputs.image_hidden_states, + ) diff --git a/src/liger_kernel/transformers/monkey_patch.py b/src/liger_kernel/transformers/monkey_patch.py new file mode 100755 index 0000000000000000000000000000000000000000..e5a526003eaafdf131ba9aecf49b88b1f1e857e6 --- /dev/null +++ b/src/liger_kernel/transformers/monkey_patch.py @@ -0,0 +1,3178 @@ +import inspect +import logging + +from functools import partial +from types import MethodType +from typing import Callable +from typing import Optional + +import transformers + +from packaging import version +from transformers import PreTrainedModel + +from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss +from liger_kernel.transformers.functional import liger_cross_entropy +from liger_kernel.transformers.geglu import LigerGEGLUMLP +from liger_kernel.transformers.layer_norm import LigerLayerNorm +from liger_kernel.transformers.model.falcon_h1 import lce_forward as falcon_h1_lce_forward +from liger_kernel.transformers.model.gemma import lce_forward as gemma_lce_forward +from liger_kernel.transformers.model.gemma2 import lce_forward as gemma2_lce_forward +from liger_kernel.transformers.model.gpt_oss import lce_forward as gpt_oss_lce_forward +from liger_kernel.transformers.model.llama import lce_forward as llama_lce_forward +from liger_kernel.transformers.model.llava import lce_forward as llava_lce_forward +from liger_kernel.transformers.model.mistral import lce_forward as mistral_lce_forward +from liger_kernel.transformers.model.mixtral import lce_forward as mixtral_lce_forward +from liger_kernel.transformers.model.phi3 import lce_forward as phi3_lce_forward +from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forward +from liger_kernel.transformers.model.smollm3 import lce_forward as smollm3_lce_forward +from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb +from liger_kernel.transformers.rms_norm import LigerRMSNorm +from liger_kernel.transformers.rope import liger_rotary_pos_emb +from liger_kernel.transformers.rope import liger_rotary_pos_emb_vision +from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP +from liger_kernel.transformers.swiglu import LigerExperts +from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP +from liger_kernel.transformers.swiglu import LigerSwiGLUMLP + +try: + import peft + + PEFT_AVAILABLE = True +except ImportError: + PEFT_AVAILABLE = False + +transformer_version = version.parse(transformers.__version__) + +logger = logging.getLogger(__name__) + +MIN_SUPPORTED_TRANSFORMERS_VERSION = version.parse("4.52.0") +if transformer_version < MIN_SUPPORTED_TRANSFORMERS_VERSION: + raise ImportError( + f"liger-kernel requires transformers >= {MIN_SUPPORTED_TRANSFORMERS_VERSION}, got {transformers.__version__}. 
" + "Please install an older version of liger-kernel that is compatible with your transformers version." + ) + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + + +def _bind_method_to_module(module, method_name: str, new_method: Callable): + # Binds a new method to a module instance so that self is passed as the first argument + module.__dict__[method_name] = new_method.__get__(module, module.__class__) + + +def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True, row_mode=None): + # Check if the module is a PEFT ModulesToSaveWrapper + # If it is, we need to patch the modules_to_save.default and original_modules + if PEFT_AVAILABLE and isinstance(module, peft.utils.other.ModulesToSaveWrapper): + module.modules_to_save.default.offset = offset + module.modules_to_save.default.casting_mode = casting_mode + module.modules_to_save.default.variance_epsilon = ( + getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps + ) + module.modules_to_save.default.in_place = in_place + module.modules_to_save.default.row_mode = row_mode + module.original_module.offset = offset + module.original_module.casting_mode = casting_mode + module.original_module.variance_epsilon = ( + getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps + ) + module.original_module.in_place = in_place + module.original_module.row_mode = row_mode + _bind_method_to_module(module.modules_to_save.default, "forward", LigerRMSNorm.forward) + _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerRMSNorm.extra_repr) + _bind_method_to_module(module.original_module, "forward", LigerRMSNorm.forward) + _bind_method_to_module(module.original_module, "extra_repr", LigerRMSNorm.extra_repr) + _bind_method_to_module(module.modules_to_save.default, "_get_name", lambda self: LigerRMSNorm.__name__) + _bind_method_to_module(module.original_module, "_get_name", lambda self: LigerRMSNorm.__name__) + else: + module.offset = offset + module.casting_mode = casting_mode + module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps + module.in_place = in_place + module.row_mode = row_mode + _bind_method_to_module(module, "forward", LigerRMSNorm.forward) + _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr) + _bind_method_to_module(module, "_get_name", lambda self: LigerRMSNorm.__name__) + + +def _patch_layer_norm_module(module, eps=1e-6): + # Check if the module is a PEFT ModulesToSaveWrapper + # If it is, we need to patch the modules_to_save.default and original_modules + if PEFT_AVAILABLE and isinstance(module, peft.utils.other.ModulesToSaveWrapper): + module.hidden_size = module.normalized_shape + _bind_method_to_module(module, "forward", LigerLayerNorm.forward) + _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr) + module.modules_to_save.default.variance_epsilon = ( + getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps + ) + module.original_module.hidden_size = getattr(module, "hidden_size", None) or getattr( + module, "normalized_shape", None + ) + module.original_module.variance_epsilon = ( + getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps + ) + module.original_module.hidden_size = getattr(module, "hidden_size", None) or getattr( + module, "normalized_shape", None + ) + _bind_method_to_module(module.modules_to_save.default, "forward", 
+        _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerLayerNorm.extra_repr)
+        _bind_method_to_module(module.original_module, "forward", LigerLayerNorm.forward)
+        _bind_method_to_module(module.original_module, "extra_repr", LigerLayerNorm.extra_repr)
+        _bind_method_to_module(module.modules_to_save.default, "_get_name", lambda self: LigerLayerNorm.__name__)
+        _bind_method_to_module(module.original_module, "_get_name", lambda self: LigerLayerNorm.__name__)
+    else:
+        module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
+        module.hidden_size = getattr(module, "hidden_size", None) or getattr(module, "normalized_shape", None)
+        _bind_method_to_module(module, "forward", LigerLayerNorm.forward)
+        _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
+        _bind_method_to_module(module, "_get_name", lambda self: LigerLayerNorm.__name__)
+
+
+def _patch_swiglu_module(module, liger_module):
+    _bind_method_to_module(module, "forward", liger_module.forward)
+    _bind_method_to_module(module, "_get_name", lambda self: liger_module.__name__)
+
+
+def _patch_geglu_module(module):
+    _bind_method_to_module(module, "forward", LigerGEGLUMLP.forward)
+    _bind_method_to_module(module, "_get_name", lambda self: LigerGEGLUMLP.__name__)
+
+
+def apply_liger_kernel_to_granite(
+    rope: bool = True,
+    cross_entropy: bool = True,
+    fused_linear_cross_entropy: bool = False,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Granite 3 models
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is False.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+
+    Debugging notes:
+        One would expect that if LigerSwiGLUMLP works for Llama, it would also work for Granite, but it does not.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.granite import modeling_granite
+    from transformers.models.granite.modeling_granite import GraniteModel
+
+    if swiglu:
+        modeling_granite.GraniteMLP = LigerSwiGLUMLP
+
+    if rms_norm:
+        modeling_granite.GraniteRMSNorm = LigerRMSNorm
+
+    if rope:
+        modeling_granite.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        raise NotImplementedError("LigerFusedLinearCrossEntropy is not available for Granite models.")
+        # NOTE: Granite's `GraniteForCausalLM.forward` scales the logits on each
+        # call, so we can't sidestep logit materialization. A bit more work
+        # would be needed to add a scaling term to the `LigerFusedLinearCrossEntropyFunction`
+        # for the logit output.
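
The note above is easy to verify on paper: Granite divides its logits by a configured scale before the loss, and because a scalar commutes with the matmul, that scale could equivalently be folded into the `lm_head` weight that a fused linear-cross-entropy kernel consumes. A minimal plain-PyTorch sketch of the equivalence (shapes and the `logits_scaling` value are illustrative, not Granite's real config):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
hidden = torch.randn(4, 64)    # last hidden states: (batch, hidden_size)
weight = torch.randn(128, 64)  # lm_head weight: (vocab_size, hidden_size)
labels = torch.randint(0, 128, (4,))
logits_scaling = 8.0           # illustrative; Granite reads this from its config

# Stock Granite forward: materialize logits, scale, then take cross entropy.
loss_ref = F.cross_entropy(hidden @ weight.T / logits_scaling, labels)

# (h @ W.T) / s == h @ (W / s).T, so a fused kernel could consume a pre-scaled
# weight and never materialize the logits.
loss_folded = F.cross_entropy(hidden @ (weight / logits_scaling).T, labels)

assert torch.allclose(loss_ref, loss_folded, atol=1e-5)
```
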
+ + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules (e.g. GraniteRMSNorm or GraniteMLP) + + # get the base model from the model instance + base_model: GraniteModel = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_llama( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3) + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.llama import modeling_llama + from transformers.models.llama.modeling_llama import LlamaModel + + if rope: + modeling_llama.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_llama.LlamaRMSNorm = LigerRMSNorm + if swiglu: + modeling_llama.LlamaMLP = LigerSwiGLUMLP + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(llama_lce_forward, model) + else: + modeling_llama.LlamaForCausalLM.forward = llama_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules (e.g. LlamaRMSNorm or LlamaMLP) + + # get the base model from the model instance + base_model: LlamaModel = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_smollm3( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace SmolLM3 model + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. 
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.smollm3 import modeling_smollm3
+    from transformers.models.smollm3.modeling_smollm3 import SmolLM3Model
+
+    if rope:
+        modeling_smollm3.apply_rotary_pos_emb = liger_rotary_pos_emb
+    if rms_norm:
+        modeling_smollm3.SmolLM3RMSNorm = LigerRMSNorm
+    if swiglu:
+        modeling_smollm3.SmolLM3MLP = LigerSwiGLUMLP
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(smollm3_lce_forward, model)
+        else:
+            modeling_smollm3.SmolLM3ForCausalLM.forward = smollm3_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules (e.g. SmolLM3RMSNorm or SmolLM3MLP)
+
+        # get the base model from the model instance
+        base_model: SmolLM3Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_llava(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    model: PreTrainedModel = None,
+    **kwargs,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace LLaVA models.
+    Due to the characteristics of LLaVA, the model instance must be passed in so that Liger-Kernel's patches
+    can also be applied to the models connected to LLaVA. However, if a language model not supported by
+    Liger-Kernel is connected to LLaVA, unexpected side effects may occur.
+    NOTE: LLaVA is not available in transformers<4.36.0
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+        **kwargs: Additional keyword arguments (e.g. `rope`, `rms_norm`, `swiglu`) forwarded to the Liger patch
+            functions of the attached language and vision models; parameters a given patcher does not accept
+            are dropped with a warning.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
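
The body below routes to per-model patchers through the `MODEL_TYPE_TO_APPLY_LIGER_FN` mapping (defined elsewhere in this file). A toy sketch of that registry-dispatch pattern, with made-up names:

```python
from typing import Callable, Dict

def apply_to_llama(rope: bool = True, **_: object) -> str:
    # Hypothetical stand-in for apply_liger_kernel_to_llama.
    return f"patched llama (rope={rope})"

# Maps a config's `model_type` string to the matching patch function.
PATCH_REGISTRY: Dict[str, Callable[..., str]] = {"llama": apply_to_llama}

model_type = "llama"  # in the real code: model.config.text_config.model_type
patch_fn = PATCH_REGISTRY.get(model_type)
if patch_fn is None:
    print(f"{model_type} is not supported by Liger kernel.")
else:
    print(patch_fn(rope=True))  # patched llama (rope=True)
```
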
+
+    from transformers.models.llava import modeling_llava
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(llava_lce_forward, model)
+        else:
+            modeling_llava.LlavaForConditionalGeneration.forward = llava_lce_forward
+
+    if model is not None:
+        text_model_name, vision_model_name = model.config.text_config.model_type, model.config.vision_config.model_type
+        text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
+        vision_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(vision_model_name, None)
+
+        kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs}
+        if text_liger_fn:
+            accept_params = inspect.signature(text_liger_fn).parameters
+            remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
+            text_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+
+            if remain_params:
+                logger.warning(
+                    f"These parameters are not supported by {text_model_name}: {list(remain_params)}. "
+                    f"Applying only the supported parameters {list(text_kwargs.keys())}.\n"
+                    f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
+                )
+            text_kwargs["model"] = model.model.language_model
+            text_liger_fn(**text_kwargs)
+        elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            logger.warning(f"{text_model_name} is not supported by Liger kernel.")
+
+        if vision_liger_fn:
+            accept_params = inspect.signature(vision_liger_fn).parameters
+            remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
+            vision_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+
+            if remain_params:
+                logger.warning(
+                    f"These parameters are not supported by {vision_model_name}: {list(remain_params)}. "
+                    f"Applying only the supported parameters {list(vision_kwargs.keys())}.\n"
+                    f"Parameters accepted by {vision_model_name}: {list(accept_params.keys())}"
+                )
+            vision_kwargs["model"] = model.model.vision_tower
+            vision_liger_fn(**vision_kwargs)
+        elif vision_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            logger.warning(f"{vision_model_name} is not supported by Liger kernel.")
+
+
+def apply_liger_kernel_to_llama4(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+    layer_norm: bool = True,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Llama4 models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        layer_norm (bool): Whether to apply Liger's LayerNorm to the vision model. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.llama4 import modeling_llama4
+    from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM
+    from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration
+    from transformers.models.llama4.modeling_llama4 import Llama4TextModel
+    from transformers.models.llama4.modeling_llama4 import Llama4VisionModel
+
+    from liger_kernel.transformers.model.llama4 import lce_forward as llama4_lce_forward
+
+    if rope:
+        from liger_kernel.transformers.llama4_rope import apply_liger_llama4_rope_full
+
+        apply_liger_llama4_rope_full(modeling_llama4)
+    if rms_norm:
+        modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
+    if swiglu:
+        modeling_llama4.Llama4TextMLP = LigerSwiGLUMLP
+
+    if cross_entropy:
+        modeling_llama4.CrossEntropyLoss = LigerCrossEntropyLoss
+
+    if fused_linear_cross_entropy:
+        modeling_llama4.Llama4ForCausalLM.forward = llama4_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, Llama4ForConditionalGeneration):
+            language_model: Llama4ForCausalLM = model.language_model
+            vision_model: Llama4VisionModel = model.vision_model
+            text_model: Llama4TextModel = language_model.model
+        elif isinstance(model, Llama4ForCausalLM):
+            text_model = model.model
+            vision_model = None
+        elif isinstance(model, Llama4TextModel):
+            text_model = model
+            vision_model = None
+        else:
+            raise ValueError(f"Unsupported Llama4 model type: {type(model)}")
+
+        if text_model:
+            if rms_norm:
+                _patch_rms_norm_module(text_model.norm)
+            for decoder_layer in text_model.layers:
+                if swiglu:
+                    if decoder_layer.is_moe_layer:
+                        _patch_swiglu_module(decoder_layer.feed_forward.shared_expert, LigerSwiGLUMLP)
+                    else:
+                        _patch_swiglu_module(decoder_layer.feed_forward, LigerSwiGLUMLP)
+                if rms_norm:
+                    _patch_rms_norm_module(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+        if vision_model:
+            if layer_norm:
+                _patch_layer_norm_module(vision_model.layernorm_pre)
+                _patch_layer_norm_module(vision_model.layernorm_post)
+
+            for layer in vision_model.model.layers:
+                if layer_norm:
+                    _patch_layer_norm_module(layer.input_layernorm)
+                    _patch_layer_norm_module(layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_mllama(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    layer_norm: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace MLlama models.
+    NOTE: MLlama is not available in transformers<4.45.0
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        layer_norm (bool): Whether to apply Liger's LayerNorm to the vision model. Default is True.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
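
The LLaVA patcher above filters the kwargs it forwards against each callee's signature, so that, e.g., a `geglu` flag is never passed to a patcher that only accepts `swiglu`. A standalone sketch of that filtering idiom (the callee here is hypothetical):

```python
import inspect

def apply_to_text_model(rope: bool = True, rms_norm: bool = True, model: object = None) -> dict:
    # Hypothetical patch function; only these parameters are accepted.
    return {"rope": rope, "rms_norm": rms_norm}

kwargs = {"rope": False, "rms_norm": True, "geglu": True}  # "geglu" is not accepted here

accept_params = inspect.signature(apply_to_text_model).parameters
remain_params = set(kwargs) - set(accept_params)
filtered = {k: v for k, v in kwargs.items() if k not in remain_params}

if remain_params:
    print(f"Dropping unsupported parameters: {sorted(remain_params)}")
print(apply_to_text_model(**filtered))  # {'rope': False, 'rms_norm': True}
```
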
+ """ + + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.mllama import modeling_mllama + from transformers.models.mllama.modeling_mllama import MllamaForCausalLM + from transformers.models.mllama.modeling_mllama import MllamaForConditionalGeneration + from transformers.models.mllama.modeling_mllama import MllamaTextModel + from transformers.models.mllama.modeling_mllama import MllamaVisionModel + + from liger_kernel.transformers.model.mllama import lce_forward as mllama_lce_forward + + if rope: + modeling_mllama.apply_rotary_pos_emb = liger_rotary_pos_emb + if layer_norm and model is None: + modeling_mllama.nn.LayerNorm = LigerLayerNorm + if rms_norm: + modeling_mllama.MllamaTextRMSNorm = LigerRMSNorm + if swiglu: + modeling_mllama.MllamaTextMLP = LigerSwiGLUMLP + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(mllama_lce_forward, model) + else: + modeling_mllama.MllamaForCausalLM.forward = mllama_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + if isinstance(model, MllamaForConditionalGeneration): + language_model: MllamaForCausalLM = model.model.language_model + vision_model: MllamaVisionModel = model.model.vision_model + if isinstance(language_model, MllamaForCausalLM): + text_model: MllamaTextModel = language_model.model + else: + text_model = language_model + elif isinstance(model, MllamaForCausalLM): + text_model = model.model + vision_model = None + elif isinstance(model, MllamaTextModel): + text_model = model + vision_model = None + + else: + raise ValueError(f"Unsupported Mllama model type: {type(model)}") + + if text_model: + if rms_norm: + _patch_rms_norm_module(text_model.norm) + for decoder_layer in text_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + if vision_model: + _patch_layer_norm_module(vision_model.layernorm_pre) + _patch_layer_norm_module(vision_model.layernorm_post) + + for layer in vision_model.transformer.layers: + if layer_norm: + _patch_layer_norm_module(layer.input_layernorm) + _patch_layer_norm_module(layer.post_attention_layernorm) + + for layer in vision_model.global_transformer.layers: + if layer_norm: + _patch_layer_norm_module(layer.input_layernorm) + _patch_layer_norm_module(layer.post_attention_layernorm) + + +def apply_liger_kernel_to_mistral( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Mistral models + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is False. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is True. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. 
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.mistral import modeling_mistral
+    from transformers.models.mistral.modeling_mistral import MistralModel
+
+    if rope:
+        modeling_mistral.apply_rotary_pos_emb = liger_rotary_pos_emb
+    if rms_norm:
+        modeling_mistral.MistralRMSNorm = LigerRMSNorm
+    if cross_entropy:
+        modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(mistral_lce_forward, model)
+        else:
+            modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+
+    if swiglu:
+        modeling_mistral.MistralMLP = LigerSwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: MistralModel = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_mixtral(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Mixtral models
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ ) + + from transformers.models.mixtral import modeling_mixtral + from transformers.models.mixtral.modeling_mixtral import MixtralModel + + if rope: + modeling_mixtral.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_mixtral.MixtralRMSNorm = LigerRMSNorm + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(mixtral_lce_forward, model) + else: + modeling_mixtral.MixtralForCausalLM.forward = mixtral_lce_forward + if swiglu: + if IS_TRANSFORMERS_V5_OR_LATER: + modeling_mixtral.MixtralExperts = LigerExperts + else: + modeling_mixtral.MixtralBlockSparseTop2MLP = LigerBlockSparseTop2MLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: MixtralModel = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + if IS_TRANSFORMERS_V5_OR_LATER: + _patch_swiglu_module(decoder_layer.mlp.experts, LigerExperts) + else: + for expert in decoder_layer.block_sparse_moe.experts: + _patch_swiglu_module(expert, LigerBlockSparseTop2MLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_pixtral( + rope: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Pixtral vision models. + + Note: Pixtral's vision encoder does not have a cross-entropy loss, so there is no + `fused_linear_cross_entropy` or `cross_entropy` option. The language model side of + Pixtral uses Mistral, which can be patched separately via `apply_liger_kernel_to_mistral`. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model + has already been loaded. Default is None. + """ + from transformers.models.pixtral import modeling_pixtral + from transformers.models.pixtral.modeling_pixtral import PixtralVisionModel + + if rope: + modeling_pixtral.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_pixtral.PixtralRMSNorm = LigerRMSNorm + if swiglu: + modeling_pixtral.PixtralMLP = LigerSwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules. 
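
As in every patcher here, swapping the class attribute on the `modeling_*` module (e.g. `modeling_pixtral.PixtralRMSNorm = LigerRMSNorm`) only affects modules constructed afterwards; instances that already exist keep their original class, which is why this branch rebinds `forward` per instance. A toy sketch of the descriptor binding that `_bind_method_to_module` performs (the classes here are made up):

```python
class RMSNorm:
    def forward(self) -> str:
        return "original forward"

def liger_forward(self) -> str:
    # Stand-in for LigerRMSNorm.forward; `self` is the patched instance.
    return f"liger forward on {type(self).__name__}"

norm = RMSNorm()  # built before any class-level patching

# Assigning the bare function would not pass `self`; binding through __get__
# yields a proper bound method, which is what _bind_method_to_module does.
norm.forward = liger_forward.__get__(norm, RMSNorm)

print(norm.forward())  # liger forward on RMSNorm
```
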
+ if isinstance(model, PixtralVisionModel): + transformer = model.transformer + else: + raise ValueError(f"Unsupported Pixtral model type: {type(model)}") + + if rms_norm: + _patch_rms_norm_module(model.ln_pre, eps=1e-5) + + for layer in transformer.layers: + if swiglu: + _patch_swiglu_module(layer.feed_forward, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(layer.attention_norm, eps=1e-5) + _patch_rms_norm_module(layer.ffn_norm, eps=1e-5) + + +def apply_liger_kernel_to_gemma( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + geglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Gemma + (Gemma 1 and 1.1 supported, for Gemma2 please use `apply_liger_kernel_to_gemma2` ) to make GPU go burrr. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.gemma import modeling_gemma + from transformers.models.gemma.modeling_gemma import GemmaModel + + from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma + + _patch_rms_norm_module_for_gemma = partial(_patch_rms_norm_module, casting_mode="gemma", offset=1.0) + + if rope: + modeling_gemma.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_gemma.GemmaRMSNorm = LigerRMSNormForGemma + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if geglu: + modeling_gemma.GemmaMLP = LigerGEGLUMLP + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(gemma_lce_forward, model) + else: + modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: GemmaModel = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module_for_gemma(base_model.norm) + + for decoder_layer in base_model.layers: + if geglu: + _patch_geglu_module(decoder_layer.mlp) + if rms_norm: + _patch_rms_norm_module_for_gemma(decoder_layer.input_layernorm) + _patch_rms_norm_module_for_gemma(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_gemma2( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + geglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Gemma2 + (for Gemma1 please use `apply_liger_kernel_to_gemma`) to make GPU go burrr. 
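
The Gemma patchers bind `_patch_rms_norm_module` with `offset=1.0` and `casting_mode="gemma"` because Gemma-family RMSNorm scales by `(1 + weight)`, where Llama-style RMSNorm scales by `weight` alone. A plain-PyTorch reference of what the `offset` knob means (this is the math, not the Triton kernel):

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6, offset: float = 0.0) -> torch.Tensor:
    # Normalize by the root mean square of the last dimension, then scale.
    variance = x.pow(2).mean(-1, keepdim=True)
    x_hat = x * torch.rsqrt(variance + eps)
    # offset=0.0 -> Llama-style scale by w; offset=1.0 -> Gemma-style scale by (1 + w).
    return x_hat * (offset + weight)

x = torch.randn(2, 8)
w = torch.zeros(8)  # Gemma initializes RMSNorm weights to zero, so (1 + w) == 1
out = rms_norm_ref(x, w, offset=1.0)
expected = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
print(torch.allclose(out, expected))  # True
```
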
+ + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.gemma2 import modeling_gemma2 + from transformers.models.gemma2.modeling_gemma2 import Gemma2Model + + from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma2 + + _patch_rms_norm_module_for_gemma2 = partial( + _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False + ) + + if rope: + modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + # https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/gemma/modeling_gemma.py#L109 + modeling_gemma2.Gemma2RMSNorm = LigerRMSNormForGemma2 + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(gemma2_lce_forward, model) + else: + modeling_gemma2.Gemma2ForCausalLM.forward = gemma2_lce_forward + if geglu: + modeling_gemma2.Gemma2MLP = LigerGEGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Gemma2Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module_for_gemma2(base_model.norm) + + for decoder_layer in base_model.layers: + if geglu: + _patch_geglu_module(decoder_layer.mlp) + if rms_norm: + _patch_rms_norm_module_for_gemma2(decoder_layer.input_layernorm) + _patch_rms_norm_module_for_gemma2(decoder_layer.post_attention_layernorm) + _patch_rms_norm_module_for_gemma2(decoder_layer.pre_feedforward_layernorm) + _patch_rms_norm_module_for_gemma2(decoder_layer.post_feedforward_layernorm) + + +def apply_liger_kernel_to_gemma3_text( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + geglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Gemma3 + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True. 
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.gemma3 import modeling_gemma3
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3DecoderLayer
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3TextModel
+
+    from liger_kernel.transformers.model.gemma3 import causal_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForGemma3
+
+    _patch_rms_norm_module_for_gemma3 = partial(
+        _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+    )
+
+    if rope:
+        modeling_gemma3.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_gemma3.Gemma3RMSNorm = LigerRMSNormForGemma3
+
+    if geglu:
+        modeling_gemma3.Gemma3MLP = LigerGEGLUMLP
+
+    # Handle loss function
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(causal_forward, model)
+        else:
+            modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        if isinstance(model, Gemma3ForCausalLM) or isinstance(model, Gemma3TextModel):
+            # get the base model from the model instance
+            base_model = model.model if isinstance(model, Gemma3ForCausalLM) else model
+
+            if rms_norm:
+                _patch_rms_norm_module_for_gemma3(base_model.norm)
+
+            for decoder_layer in base_model.layers:
+                decoder_layer: Gemma3DecoderLayer
+                if geglu:
+                    _bind_method_to_module(decoder_layer.mlp, "forward", LigerGEGLUMLP.forward)
+                if rms_norm:
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.post_attention_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.pre_feedforward_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.post_feedforward_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.self_attn.q_norm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.self_attn.k_norm)
+
+        else:
+            raise TypeError("The model must be Gemma3ForCausalLM or Gemma3TextModel.")
+
+
+def apply_liger_kernel_to_gemma3(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    layer_norm: bool = True,
+    rms_norm: bool = True,
+    geglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Gemma3 (multimodal)
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.gemma3 import modeling_gemma3
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration
+    from transformers.models.siglip import modeling_siglip
+    from transformers.models.siglip.modeling_siglip import SiglipEncoderLayer
+    from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+
+    from liger_kernel.transformers.model.gemma3 import multimodal_forward
+
+    _patch_rms_norm_module_for_gemma3 = partial(
+        _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+    )
+
+    if layer_norm and model is None:
+        modeling_siglip.nn.LayerNorm = LigerLayerNorm
+
+    apply_liger_kernel_to_gemma3_text(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
+
+    if cross_entropy:
+        modeling_gemma3.nn.CrossEntropyLoss = LigerCrossEntropyLoss
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(multimodal_forward, model)
+        else:
+            modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        if isinstance(model, Gemma3ForConditionalGeneration):
+            if isinstance(model.model.vision_tower, SiglipVisionModel):
+                vision_tower = model.model.vision_tower
+
+                if layer_norm:
+                    _patch_layer_norm_module(vision_tower.vision_model.post_layernorm)
+
+                for layer in vision_tower.vision_model.encoder.layers:
+                    layer: SiglipEncoderLayer
+                    if layer_norm:
+                        _patch_layer_norm_module(layer.layer_norm1)
+                        _patch_layer_norm_module(layer.layer_norm2)
+            else:
+                raise TypeError("The vision tower must be SiglipVisionModel")
+
+            if rms_norm:
+                _patch_rms_norm_module_for_gemma3(model.model.multi_modal_projector.mm_soft_emb_norm)
+
+            apply_liger_kernel_to_gemma3_text(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=model.model.language_model,
+            )
+
+        else:
+            raise TypeError("The model must be Gemma3ForConditionalGeneration.")
+
+
+def apply_liger_kernel_to_paligemma(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    layer_norm: bool = True,
+    rms_norm: bool = True,
+    geglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace PaliGemma
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
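
As the docstring above notes, every patcher in this file supports two call patterns: patch the `modeling_*` classes before the model is constructed, or pass an already-loaded model so the existing modules are rebound in place. A usage sketch using the Llama patcher (the checkpoint ID is a placeholder):

```python
from transformers import AutoModelForCausalLM

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama

# Pattern 1: patch first, then construct; the swapped classes (LigerRMSNorm,
# LigerSwiGLUMLP, ...) are picked up when the model is built.
apply_liger_kernel_to_llama()
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# Pattern 2: the model already exists, so pass it in; the patcher then walks
# the decoder layers and rebinds forward on each pre-built module.
model2 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
apply_liger_kernel_to_llama(model=model2)
```
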
+ """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model'] + + from transformers.models.gemma.modeling_gemma import GemmaForCausalLM + from transformers.models.gemma.modeling_gemma import GemmaModel + from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM + from transformers.models.gemma2.modeling_gemma2 import Gemma2Model + from transformers.models.paligemma import modeling_paligemma + from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration + from transformers.models.siglip import modeling_siglip + from transformers.models.siglip.modeling_siglip import SiglipEncoderLayer + from transformers.models.siglip.modeling_siglip import SiglipVisionModel + + from liger_kernel.transformers.model.paligemma import lce_forward + + # The vision_tower is a SiglipVisionModel + if layer_norm and model is None: + modeling_siglip.nn.LayerNorm = LigerLayerNorm + + # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible + # The multi_modal_projector is Linear, nothing to do + + # The language_model is GemmaForCausalLM or Gemma2ForCausalLM + apply_liger_kernel_to_gemma( + rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu + ) + apply_liger_kernel_to_gemma2( + rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu + ) + # Handle loss function + if cross_entropy: + modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(lce_forward, model) + else: + modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + if not isinstance(model, PaliGemmaForConditionalGeneration): + raise TypeError("model have to be of type PaliGemmaForConditionalGeneration") + + vision_tower: SiglipVisionModel = model.model.vision_tower + + _patch_layer_norm_module(vision_tower.vision_model.post_layernorm) + + for layer in vision_tower.vision_model.encoder.layers: + layer: SiglipEncoderLayer + if layer_norm: + _patch_layer_norm_module(layer.layer_norm1) + _patch_layer_norm_module(layer.layer_norm2) + + language_model = model.model.language_model + + if isinstance(language_model, (GemmaForCausalLM, GemmaModel)): + apply_liger_kernel_to_gemma( + rope=rope, + cross_entropy=False, + fused_linear_cross_entropy=False, + rms_norm=rms_norm, + geglu=geglu, + model=language_model, + ) + + elif isinstance(language_model, (Gemma2ForCausalLM, Gemma2Model)): + apply_liger_kernel_to_gemma2( + rope=rope, + cross_entropy=False, + fused_linear_cross_entropy=False, + rms_norm=rms_norm, + geglu=geglu, + model=language_model, + ) + else: + raise TypeError( + "The language_model of a PaliGemma model must be either GemmaForCausalLM or Gemma2ForCausalLM." + ) + + +def apply_liger_kernel_to_qwen2( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Qwen2 models + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. 
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.qwen2 import modeling_qwen2 + from transformers.models.qwen2.modeling_qwen2 import Qwen2Model + + if rope: + modeling_qwen2.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_qwen2.Qwen2RMSNorm = LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(qwen2_lce_forward, model) + else: + modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward + + if swiglu: + modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Qwen2Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_qwen3( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+    )
+
+    from transformers.models.qwen3 import modeling_qwen3
+    from transformers.models.qwen3.modeling_qwen3 import Qwen3Model
+
+    from liger_kernel.transformers.model.qwen3 import lce_forward as qwen3_lce_forward
+
+    if rope:
+        modeling_qwen3.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(qwen3_lce_forward, model)
+        else:
+            modeling_qwen3.Qwen3ForCausalLM.forward = qwen3_lce_forward
+
+    if swiglu:
+        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: Qwen3Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_qwen3_moe(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 MoE models.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.qwen3_moe import modeling_qwen3_moe
+    from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeModel
+
+    from liger_kernel.transformers.model.qwen3_moe import lce_forward as qwen3_lce_forward
+    from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+    if rope:
+        modeling_qwen3_moe.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(qwen3_lce_forward, model)
+        else:
+            modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = qwen3_lce_forward
+
+    if swiglu:
+        if IS_TRANSFORMERS_V5_OR_LATER:
+            modeling_qwen3_moe.Qwen3MoeExperts = LigerExperts
+        else:
+            modeling_qwen3_moe.Qwen3MoeMLP = LigerQwen3MoeSwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: Qwen3MoeModel = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                if IS_TRANSFORMERS_V5_OR_LATER:
+                    _patch_swiglu_module(decoder_layer.mlp.experts, LigerExperts)
+                else:
+                    for mlp_expert in decoder_layer.mlp.experts:
+                        _patch_swiglu_module(mlp_expert, LigerQwen3MoeSwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_gpt_oss(
+    rope: bool = True,
+    cross_entropy: bool =
False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = False, # Set to False by default since GPT-OSS has custom expert implementation + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace GPT-OSS models. + NOTE: GPT-OSS is supported in transformers >= 4.55.0 + NOTE: SwiGLU patching is disabled by default for GPT-OSS as it uses a custom expert + implementation with clamping and MXFP4 quantization. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False. + Note: GPT-OSS uses a custom expert implementation, so SwiGLU patching is disabled by default. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + if version.parse(transformers.__version__) < version.parse("4.55.0"): + logger.warning("GPT-OSS support requires transformers >= 4.55.0") + return + + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.gpt_oss import modeling_gpt_oss + from transformers.models.gpt_oss.modeling_gpt_oss import GptOssModel + + if rope: + modeling_gpt_oss.apply_rotary_pos_emb = liger_rotary_pos_emb + + if rms_norm: + modeling_gpt_oss.GptOssRMSNorm = LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(gpt_oss_lce_forward, model) + else: + modeling_gpt_oss.GptOssForCausalLM.forward = gpt_oss_lce_forward + + # Note: SwiGLU patching is not implemented for GPT-OSS due to custom expert implementation + # with clamping (swiglu_limit=7.0) and MXFP4 quantization + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: GptOssModel = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + for decoder_layer in base_model.layers: + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_qwen2_vl( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + layer_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Qwen2-VL models. + NOTE: Qwen2-VL is not supported in transformers<4.52.4 + + Args: + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. 
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + if transformer_version < version.parse("4.52.4"): + logger.warning("Qwen2-VL support is only compatible with transformers >= 4.52.4") + return + + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.qwen2_vl import modeling_qwen2_vl + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VisionTransformerPretrainedModel + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLModel + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLTextModel + + from liger_kernel.transformers.model.qwen2_vl import lce_forward as qwen2_vl_lce_forward + + if rope: + modeling_qwen2_vl.apply_multimodal_rotary_pos_emb = liger_multimodal_rotary_pos_emb + if rms_norm: + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L439 + modeling_qwen2_vl.Qwen2RMSNorm = LigerRMSNorm + if layer_norm and model is None: + modeling_qwen2_vl.LayerNorm = LigerLayerNorm + if cross_entropy: + modeling_qwen2_vl.CrossEntropyLoss = LigerCrossEntropyLoss + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(qwen2_vl_lce_forward, model) + else: + modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = qwen2_vl_lce_forward + if swiglu: + modeling_qwen2_vl.Qwen2MLP = LigerSwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + if isinstance(model, Qwen2VLForConditionalGeneration): + text_model: Qwen2VLTextModel = model.model.language_model + vision_model: Qwen2VisionTransformerPretrainedModel = model.model.visual + elif isinstance(model, Qwen2VLModel): + text_model: Qwen2VLTextModel = model.language_model + vision_model: Qwen2VisionTransformerPretrainedModel = model.visual + elif isinstance(model, Qwen2VLTextModel): + text_model: Qwen2VLTextModel = model + vision_model = None + else: + # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed. + raise TypeError( + f"Unsupported Qwen2VL model type. `model` must be `Qwen2VLForConditionalGeneration`, `Qwen2VLModel` or `Qwen2VLTextModel`. 
Got: {type(model)}"
+            )
+
+        # Patch Qwen2VisionTransformerPretrainedModel
+        if vision_model is not None:
+            for vision_block in vision_model.blocks:
+                if layer_norm:
+                    _patch_layer_norm_module(vision_block.norm1)
+                    _patch_layer_norm_module(vision_block.norm2)
+
+        # Patch Qwen2VLTextModel
+        if text_model is not None:
+            if rms_norm:
+                _patch_rms_norm_module(text_model.norm)
+            for decoder_layer in text_model.layers:
+                if swiglu:
+                    _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+                if rms_norm:
+                    _patch_rms_norm_module(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_qwen2_5_vl(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen2.5-VL models.
+    NOTE: Qwen2.5-VL is not available in transformers<4.48.2, and this patch additionally requires transformers>=4.52.4.
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    if transformer_version < version.parse("4.52.4"):
+        logger.warning("Qwen2.5-VL support is only compatible with transformers >= 4.52.4")
+        return
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
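+        # A usage sketch for reference (hedged: the checkpoint name is illustrative,
+        # and this assumes the function is exported from `liger_kernel.transformers`
+        # like the other apply_* entry points in this repo):
+        #
+        #   from transformers import Qwen2_5_VLForConditionalGeneration
+        #   from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl
+        #
+        #   apply_liger_kernel_to_qwen2_5_vl()  # pre-load: patch the classes first
+        #   model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        #   # or, for an already loaded instance:
+        #   # apply_liger_kernel_to_qwen2_5_vl(model=model)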
+    )
+
+    from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl
+    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VisionTransformerPretrainedModel
+    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLModel
+    from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLTextModel
+
+    from liger_kernel.transformers.model.qwen2_5_vl import lce_forward as qwen2_5_vl_lce_forward
+
+    if rope:
+        modeling_qwen2_5_vl.apply_multimodal_rotary_pos_emb = liger_multimodal_rotary_pos_emb
+    if rms_norm:
+        modeling_qwen2_5_vl.Qwen2RMSNorm = LigerRMSNorm
+    if cross_entropy:
+        modeling_qwen2_5_vl.CrossEntropyLoss = LigerCrossEntropyLoss
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(qwen2_5_vl_lce_forward, model)
+        else:
+            modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = qwen2_5_vl_lce_forward
+    if swiglu:
+        modeling_qwen2_5_vl.Qwen2MLP = LigerSwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, Qwen2_5_VLForConditionalGeneration):
+            text_model: Qwen2_5_VLTextModel = model.model.language_model
+            vision_model: Qwen2_5_VisionTransformerPretrainedModel = model.model.visual
+        elif isinstance(model, Qwen2_5_VLModel):
+            text_model: Qwen2_5_VLTextModel = model.language_model
+            vision_model: Qwen2_5_VisionTransformerPretrainedModel = model.visual
+        elif isinstance(model, Qwen2_5_VLTextModel):
+            text_model: Qwen2_5_VLTextModel = model
+            vision_model = None
+        else:
+            # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed.
+            raise TypeError(
+                f"Unsupported Qwen2.5-VL model type. `model` must be `Qwen2_5_VLForConditionalGeneration`, `Qwen2_5_VLModel` or `Qwen2_5_VLTextModel`. Got: {type(model)}"
+            )
+
+        if vision_model is not None:
+            # Patch Qwen2_5_VisionTransformerPretrainedModel
+            for vision_block in vision_model.blocks:
+                if rms_norm:
+                    _patch_rms_norm_module(vision_block.norm1)
+                    _patch_rms_norm_module(vision_block.norm2)
+
+        if text_model is not None:
+            if rms_norm:
+                _patch_rms_norm_module(text_model.norm)
+            for decoder_layer in text_model.layers:
+                if swiglu:
+                    _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+                if rms_norm:
+                    _patch_rms_norm_module(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_qwen3_vl(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = False,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3-VL models.
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded.
+        Default is None.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.qwen3_vl import modeling_qwen3_vl
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLModel
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextModel
+
+    from liger_kernel.transformers.model.qwen3_vl import lce_forward as qwen3_vl_lce_forward
+
+    if rope:
+        modeling_qwen3_vl.apply_rotary_pos_emb = liger_rotary_pos_emb
+        modeling_qwen3_vl.apply_rotary_pos_emb_vision = liger_rotary_pos_emb_vision
+
+    if rms_norm:
+        modeling_qwen3_vl.Qwen3VLTextRMSNorm = LigerRMSNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(qwen3_vl_lce_forward, model)
+        else:
+            modeling_qwen3_vl.Qwen3VLForConditionalGeneration.forward = qwen3_vl_lce_forward
+
+    if model is not None and rms_norm:
+        if isinstance(model, Qwen3VLForConditionalGeneration):
+            text_model: Qwen3VLTextModel = model.model.language_model
+        elif isinstance(model, Qwen3VLModel):
+            text_model: Qwen3VLTextModel = model.language_model
+        elif isinstance(model, Qwen3VLTextModel):
+            text_model = model
+        else:
+            raise TypeError(
+                f"Unsupported Qwen3VL model type. `model` must be `Qwen3VLForConditionalGeneration`, `Qwen3VLModel` or `Qwen3VLTextModel`. Got: {type(model)}"
+            )
+
+        _patch_qwen3_vl_rms_norm = partial(_patch_rms_norm_module, offset=0.0, casting_mode="llama")
+
+        if text_model is not None:
+            _patch_qwen3_vl_rms_norm(text_model.norm)
+            for decoder_layer in text_model.layers:
+                _patch_qwen3_vl_rms_norm(decoder_layer.input_layernorm)
+                _patch_qwen3_vl_rms_norm(decoder_layer.post_attention_layernorm)
+                self_attn = getattr(decoder_layer, "self_attn", None)
+                if self_attn is not None:
+                    if hasattr(self_attn, "q_norm") and self_attn.q_norm is not None:
+                        _patch_qwen3_vl_rms_norm(self_attn.q_norm)
+                    if hasattr(self_attn, "k_norm") and self_attn.k_norm is not None:
+                        _patch_qwen3_vl_rms_norm(self_attn.k_norm)
+
+
+def apply_liger_kernel_to_qwen3_vl_moe(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = False,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3-VL MoE models.
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
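+        # Note on the RMSNorm patching in this function (sketch of the partial
+        # defined further below, shown here for orientation):
+        #   patch = partial(_patch_rms_norm_module, offset=0.0, casting_mode="llama")
+        # i.e. llama-style casting with no (1 + weight) offset, applied to the final
+        # norm, both per-layer norms, and the per-head q_norm/k_norm when present.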
+ ) + + from transformers.models.qwen3_vl_moe import modeling_qwen3_vl_moe + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeModel + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextModel + + from liger_kernel.transformers.model.qwen3_vl_moe import lce_forward as qwen3_vl_moe_lce_forward + + if rope: + modeling_qwen3_vl_moe.apply_rotary_pos_emb = liger_rotary_pos_emb + modeling_qwen3_vl_moe.apply_rotary_pos_emb_vision = liger_rotary_pos_emb_vision + + if rms_norm: + modeling_qwen3_vl_moe.Qwen3VLMoeTextRMSNorm = LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(qwen3_vl_moe_lce_forward, model) + else: + modeling_qwen3_vl_moe.Qwen3VLMoeForConditionalGeneration.forward = qwen3_vl_moe_lce_forward + + if model is not None and rms_norm: + if isinstance(model, Qwen3VLMoeForConditionalGeneration): + text_model: Qwen3VLMoeTextModel = model.model.language_model + elif isinstance(model, Qwen3VLMoeModel): + text_model: Qwen3VLMoeTextModel = model.language_model + elif isinstance(model, Qwen3VLMoeTextModel): + text_model = model + else: + raise TypeError( + f"Unsupported Qwen3VLMoe model type. `model` must be `Qwen3VLMoeForConditionalGeneration`, `Qwen3VLMoeModel` or `Qwen3VLMoeTextModel`. Got: {type(model)}" + ) + + _patch_qwen3_vl_moe_rms_norm = partial(_patch_rms_norm_module, offset=0.0, casting_mode="llama") + + if text_model is not None: + _patch_qwen3_vl_moe_rms_norm(text_model.norm) + for decoder_layer in text_model.layers: + _patch_qwen3_vl_moe_rms_norm(decoder_layer.input_layernorm) + _patch_qwen3_vl_moe_rms_norm(decoder_layer.post_attention_layernorm) + self_attn = getattr(decoder_layer, "self_attn", None) + if self_attn is not None: + if hasattr(self_attn, "q_norm") and self_attn.q_norm is not None: + _patch_qwen3_vl_moe_rms_norm(self_attn.q_norm) + if hasattr(self_attn, "k_norm") and self_attn.k_norm is not None: + _patch_qwen3_vl_moe_rms_norm(self_attn.k_norm) + + +def apply_liger_kernel_to_phi3( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Phi3 models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU Phi3MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
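+        # What the two patching modes below amount to (a sketch, not executed here;
+        # names are the ones used in this file):
+        #   pre-load : modeling_phi3.Phi3RMSNorm = LigerRMSNorm
+        #              -> modules instantiated afterwards are Liger modules
+        #   post-load: _patch_rms_norm_module(existing_module)
+        #              -> rebinds forward on already-instantiated modules
+        # This is why the `model is not None` case gets the extra instance-walking branch.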
+ ) + + from transformers.models.phi3 import modeling_phi3 + from transformers.models.phi3.modeling_phi3 import Phi3Model + + if rope: + modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb # Same as Gemma + if rms_norm: + modeling_phi3.Phi3RMSNorm = LigerRMSNorm # Same as Llama + if swiglu: + modeling_phi3.Phi3MLP = LigerPhi3SwiGLUMLP + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(phi3_lce_forward, model) + else: + modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Phi3Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerPhi3SwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_olmo2( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace OLMO2 models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU Olmo2MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+ ) + + from transformers.models.olmo2 import modeling_olmo2 + from transformers.models.olmo2.modeling_olmo2 import Olmo2Model + + from liger_kernel.transformers.model.olmo2 import lce_forward as olmo2_lce_forward + from liger_kernel.transformers.rms_norm import LigerRMSNormForOlmo2 + + if rope: + modeling_olmo2.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_olmo2.Olmo2RMSNorm = LigerRMSNormForOlmo2 + if swiglu: + modeling_olmo2.Olmo2MLP = LigerSwiGLUMLP + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(olmo2_lce_forward, model) + else: + modeling_olmo2.Olmo2ForCausalLM.forward = olmo2_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Olmo2Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False) + + +def apply_liger_kernel_to_olmo3( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Olmo3 models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU to Olmo3MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) + + from transformers.models.olmo3 import modeling_olmo3 + from transformers.models.olmo3.modeling_olmo3 import Olmo3Model + + from liger_kernel.transformers.model.olmo3 import lce_forward as olmo3_lce_forward + from liger_kernel.transformers.rms_norm import LigerRMSNormForOlmo2 + + # Olmo3 arch is very similar to Olmo2, so we can reuse all these components in the same way. 
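+    # Concretely, the class-level swaps below mirror apply_liger_kernel_to_olmo2:
+    #   modeling_olmo3.Olmo3RMSNorm = LigerRMSNormForOlmo2
+    #   modeling_olmo3.Olmo3MLP = LigerSwiGLUMLP
+    # and the instance path patches only the post_attention/post_feedforward norms
+    # (the Olmo2/Olmo3 blocks apply their norms after attention/MLP), using
+    # in_place=False exactly as the Olmo2 path does.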
+ if rope: + modeling_olmo3.apply_rotary_pos_emb = liger_rotary_pos_emb + if rms_norm: + modeling_olmo3.Olmo3RMSNorm = LigerRMSNormForOlmo2 # same as olmo2 + if swiglu: + modeling_olmo3.Olmo3MLP = LigerSwiGLUMLP + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(olmo3_lce_forward, model) + else: + modeling_olmo3.Olmo3ForCausalLM.forward = olmo3_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Olmo3Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False) + + +def apply_liger_kernel_to_glm4( + rope: bool = False, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace GLM-4 models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is False. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU Glm4MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+ ) + + from transformers.models.glm4 import modeling_glm4 + from transformers.models.glm4.modeling_glm4 import Glm4Model + + from liger_kernel.transformers.model.glm4 import lce_forward as glm4_lce_forward + from liger_kernel.transformers.rms_norm import LigerRMSNormForGlm4 + + if rope: + raise NotImplementedError("liger_rotary_pos_emb is not available for Glm4 models.") + if rms_norm: + modeling_glm4.Glm4RMSNorm = LigerRMSNormForGlm4 + if swiglu: + modeling_glm4.Glm4MLP = LigerPhi3SwiGLUMLP + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(glm4_lce_forward, model) + else: + modeling_glm4.Glm4ForCausalLM.forward = glm4_lce_forward + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Glm4Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm, in_place=False) + + for decoder_layer in base_model.layers: + if swiglu: + _patch_swiglu_module(decoder_layer.mlp, LigerPhi3SwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_self_attn_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_mlp_layernorm, in_place=False) + + +def apply_liger_kernel_to_glm4v( + rope: bool = False, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace GLM-4v models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is False. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU Glm4MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+    )
+
+    from transformers.models.glm4v import modeling_glm4v
+    from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration
+    from transformers.models.glm4v.modeling_glm4v import Glm4vModel
+    from transformers.models.glm4v.modeling_glm4v import Glm4vTextModel
+    from transformers.models.glm4v.modeling_glm4v import Glm4vVisionModel
+
+    from liger_kernel.transformers.model.glm4v import lce_forward as glm4v_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForGlm4
+
+    if rope:
+        raise NotImplementedError("liger_rotary_pos_emb is not available for Glm4v models.")
+    if rms_norm:
+        modeling_glm4v.Glm4vRMSNorm = LigerRMSNormForGlm4
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(glm4v_lce_forward, model)
+        else:
+            modeling_glm4v.Glm4vForConditionalGeneration.forward = glm4v_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, Glm4vForConditionalGeneration):
+            text_model: Glm4vTextModel = model.model.language_model
+            vision_model: Glm4vVisionModel = model.model.visual
+        elif isinstance(model, Glm4vModel):
+            text_model: Glm4vTextModel = model.language_model
+            vision_model: Glm4vVisionModel = model.visual
+        elif isinstance(model, Glm4vTextModel):
+            text_model: Glm4vTextModel = model
+            vision_model = None
+        else:
+            # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed.
+            raise TypeError(
+                f"Unsupported Glm4v model type. `model` must be `Glm4vForConditionalGeneration`, `Glm4vModel` or `Glm4vTextModel`. Got: {type(model)}"
+            )
+
+        if vision_model is not None:
+            for vision_block in vision_model.blocks:
+                if rms_norm:
+                    _patch_rms_norm_module(vision_block.norm1)
+                    _patch_rms_norm_module(vision_block.norm2)
+                if swiglu:
+                    _patch_swiglu_module(vision_block.mlp, LigerSwiGLUMLP)
+
+        if text_model is not None:
+            if rms_norm:
+                _patch_rms_norm_module(text_model.norm)
+            for decoder_layer in text_model.layers:
+                if swiglu:
+                    _patch_swiglu_module(decoder_layer.mlp, LigerPhi3SwiGLUMLP)
+                if rms_norm:
+                    _patch_rms_norm_module(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_self_attn_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_mlp_layernorm)
+
+
+def apply_liger_kernel_to_glm4v_moe(
+    rope: bool = False,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace GLM4v_moe models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLUMLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.glm4v_moe import modeling_glm4v_moe
+    from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeForConditionalGeneration
+    from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeModel
+    from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextModel
+    from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeVisionModel
+
+    from liger_kernel.transformers.model.glm4v_moe import lce_forward as glm4v_moe_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForGlm4
+
+    if rope:
+        raise NotImplementedError("liger_rotary_pos_emb is not available for Glm4v_moe models.")
+    if rms_norm:
+        modeling_glm4v_moe.Glm4vMoeRMSNorm = LigerRMSNormForGlm4
+        modeling_glm4v_moe.Glm4vMoeTextRMSNorm = LigerRMSNormForGlm4
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(glm4v_moe_lce_forward, model)
+        else:
+            modeling_glm4v_moe.Glm4vMoeForConditionalGeneration.forward = glm4v_moe_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules.
+        # Bind the MoE block class up front so it is defined for every branch below.
+        Glm4vMoeTextMoE = modeling_glm4v_moe.Glm4vMoeTextMoE
+        if isinstance(model, Glm4vMoeForConditionalGeneration):
+            text_model: Glm4vMoeTextModel = model.model.language_model
+            vision_model: Glm4vMoeVisionModel = model.model.visual
+        elif isinstance(model, Glm4vMoeModel):
+            text_model: Glm4vMoeTextModel = model.language_model
+            vision_model: Glm4vMoeVisionModel = model.visual
+        elif isinstance(model, Glm4vMoeTextModel):
+            text_model: Glm4vMoeTextModel = model
+            vision_model = None
+        else:
+            # Note: Currently there's no support for patching vision model only. Feel free to raise an issue if needed.
+            raise TypeError(
+                f"Unsupported glm4v_moe model type. `model` must be `Glm4vMoeForConditionalGeneration`, `Glm4vMoeModel` or `Glm4vMoeTextModel`. Got: {type(model)}"
+            )
+
+        if vision_model is not None:
+            if rms_norm:
+                _patch_rms_norm_module(vision_model.post_conv_layernorm)
+                _patch_rms_norm_module(vision_model.post_layernorm)
+            for vision_block in vision_model.blocks:
+                if rms_norm:
+                    _patch_rms_norm_module(vision_block.norm1)
+                    _patch_rms_norm_module(vision_block.norm2)
+                if swiglu:
+                    _patch_swiglu_module(vision_block.mlp, LigerSwiGLUMLP)
+
+        if text_model is not None:
+            if rms_norm:
+                _patch_rms_norm_module(text_model.norm)
+            for decoder_layer in text_model.layers:
+                if swiglu:
+                    if isinstance(decoder_layer.mlp, Glm4vMoeTextMoE):
+                        # MoE layers: patch the routed experts and the shared experts
+                        experts = getattr(decoder_layer.mlp, "experts", None)
+                        if experts is not None:
+                            for expert in experts:
+                                _patch_swiglu_module(expert, LigerSwiGLUMLP)
+                        if decoder_layer.mlp.shared_experts is not None:
+                            _patch_swiglu_module(decoder_layer.mlp.shared_experts, LigerSwiGLUMLP)
+                    else:
+                        # Dense layers: patch the MLP module directly
+                        _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+                if rms_norm:
+                    _patch_rms_norm_module(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_internvl(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    layer_norm: bool = True,
+    model: Optional[PreTrainedModel] = None,
+    **kwargs,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace InternVL models.
+    Because InternVL wraps a separate language model, the loaded model instance must be passed
+    so that Liger's patches can also be applied to the connected LM.
+    However, if an LM not supported by Liger-Kernel is connected to InternVL, unexpected side effects may occur.
+    NOTE: InternVL is not available in transformers<4.52.1
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
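+        # Dispatch sketch of what follows (all names come from this file): the
+        # connected LM is patched via its own apply_* function, with kwargs filtered
+        # down to the parameters that function actually accepts:
+        #   text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(model.config.text_config.model_type)
+        #   accept_params = inspect.signature(text_liger_fn).parameters
+        #   text_kwargs = {k: v for k, v in kwargs.items() if k in accept_params}
+        #   text_liger_fn(model=text_model, **text_kwargs)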
+ ) + import torch.nn as torch_nn + + from transformers.models.internvl import modeling_internvl + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + from transformers.models.internvl.modeling_internvl import InternVLModel + from transformers.models.internvl.modeling_internvl import InternVLVisionLayer + from transformers.models.internvl.modeling_internvl import InternVLVisionModel + from transformers.models.internvl.modeling_internvl import InternVLVisionRMSNorm + + from liger_kernel.transformers.layer_norm import LigerLayerNorm + from liger_kernel.transformers.model.internvl import lce_forward as internvl_lce_forward + from liger_kernel.transformers.rms_norm import LigerRMSNorm + + if layer_norm and model is None: + modeling_internvl.nn.LayerNorm = LigerLayerNorm + + if cross_entropy: + logger.info("Apply liger cross entropy") + + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + if fused_linear_cross_entropy: + modeling_internvl.InternVLForConditionalGeneration.forward = internvl_lce_forward + if rms_norm: + modeling_internvl.InternVLVisionRMSNorm = LigerRMSNorm + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + if isinstance(model, InternVLForConditionalGeneration): + text_model = model.model.language_model + vision_model: InternVLVisionModel = model.model.vision_tower + elif isinstance(model, InternVLModel): + text_model = model.language_model + vision_model: InternVLVisionModel = model.vision_tower + else: + raise TypeError( + f"Unsupported internvl model type. `model` must be `InternVLForConditionalGeneration`, `InternVLModel`. Got: {type(model)}" + ) + + text_model_name = model.config.text_config.model_type + text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None) + + kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm} + if text_liger_fn: + accept_params = inspect.signature(text_liger_fn).parameters + remain_params = set(kwargs) - (set(accept_params) & set(kwargs)) + text_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params} + + if remain_params: + logger.warning( + f"These parameters are not supported by {text_model_name}. 
Applying only {list(text_kwargs.keys())}; ignoring {list(remain_params)}\n"
+                    f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
+                )
+            text_kwargs["model"] = text_model
+            text_liger_fn(**text_kwargs)
+        elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            logger.warning(f"{text_model_name} is not supported by Liger kernel.")
+
+        # Patch vision model RMSNorm layers
+        if rms_norm:
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.attention.q_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.q_norm)
+                if isinstance(encoder_layer.attention.k_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.k_norm)
+
+        # Patch vision model LayerNorm layers
+        if layer_norm:
+            # Patch layernorm
+            if isinstance(vision_model.layernorm, torch_nn.LayerNorm):
+                _patch_layer_norm_module(vision_model.layernorm)
+
+            # Patch encoder layers
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.layernorm_before, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_before)
+                if isinstance(encoder_layer.layernorm_after, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_after)
+
+
+def apply_liger_kernel_to_smolvlm(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    layer_norm: bool = True,
+    model: Optional[PreTrainedModel] = None,
+    **kwargs,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace SmolVLM models.
+    Because SmolVLM wraps a separate language model, the loaded model instance must be passed
+    so that Liger's patches can also be applied to the connected LM.
+    However, if an LM not supported by Liger-Kernel is connected to SmolVLM, unexpected side effects may occur.
+    NOTE: SmolVLM is not available in transformers<4.50.0
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.smolvlm import modeling_smolvlm
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMEncoderLayer
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMModel
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
+
+    from liger_kernel.transformers.model.smolvlm import lce_forward as smolvlm_lce_forward
+
+    # Patch LayerNorm for vision model if model is not provided (pre-initialization)
+    if layer_norm and model is None:
+        modeling_smolvlm.nn.LayerNorm = LigerLayerNorm
+
+    if cross_entropy:
+        logger.info("Apply liger cross entropy")
+
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(smolvlm_lce_forward, model)
+        else:
+            modeling_smolvlm.SmolVLMForConditionalGeneration.forward = smolvlm_lce_forward
+    if rms_norm:
+        modeling_smolvlm.SmolVLMRMSNorm = LigerRMSNorm
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, SmolVLMForConditionalGeneration):
+            text_model = model.model.text_model
+            vision_model: SmolVLMVisionTransformer = model.model.vision_model
+        elif isinstance(model, SmolVLMModel):
+            text_model = model.text_model
+            vision_model: SmolVLMVisionTransformer = model.vision_model
+        else:
+            raise TypeError(
+                f"Unsupported smolvlm model type. `model` must be `SmolVLMForConditionalGeneration` or `SmolVLMModel`. Got: {type(model)}"
+            )
+
+        text_model_name = model.config.text_config.model_type
+        text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
+
+        kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
+        if text_liger_fn:
+            accept_params = inspect.signature(text_liger_fn).parameters
+            remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
+            text_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+
+            if remain_params:
+                logger.warning(
+                    f"These parameters are not supported by {text_model_name}. Applying only {list(text_kwargs.keys())}; ignoring {list(remain_params)}\n"
+                    f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
+                )
+            text_kwargs["model"] = text_model
+            text_liger_fn(**text_kwargs)
+        elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            logger.warning(f"{text_model_name} is not supported by Liger kernel.")
+
+        # Patch vision model LayerNorm layers
+        if layer_norm:
+            # Patch post_layernorm
+            _patch_layer_norm_module(vision_model.post_layernorm)
+
+            # Patch encoder layers
+            for encoder_layer in vision_model.encoder.layers:
+                encoder_layer: SmolVLMEncoderLayer
+                _patch_layer_norm_module(encoder_layer.layer_norm1)
+                _patch_layer_norm_module(encoder_layer.layer_norm2)
+
+
+def apply_liger_kernel_to_falcon_h1(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = False,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Falcon-H1 models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.falcon_h1 import modeling_falcon_h1
+    from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1Model
+
+    if rope:
+        logger.info("Apply liger rotary pos emb.")
+        modeling_falcon_h1.apply_rotary_pos_emb = liger_rotary_pos_emb
+    if rms_norm:
+        logger.info("Apply liger RMSNorm")
+        modeling_falcon_h1.FalconH1RMSNorm = LigerRMSNorm
+    if swiglu:
+        logger.warning("LigerSwiGLUMLP is not available for Falcon-H1 models. There will be no effect.")
+
+    if cross_entropy:
+        logger.info("Apply liger cross entropy")
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(falcon_h1_lce_forward, model)
+        else:
+            modeling_falcon_h1.FalconH1ForCausalLM.forward = falcon_h1_lce_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules (e.g. FalconH1RMSNorm)
+
+        # get the base model from the model instance
+        base_model: FalconH1Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.final_layernorm)
+
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.pre_ff_layernorm)
+
+
+def apply_liger_kernel_to_qwen3_next(
+    rope: bool = False,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3-Next models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLUMLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.qwen3_next import modeling_qwen3_next
+    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM
+    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextMLP
+    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextModel
+    from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock
+
+    from liger_kernel.transformers.model.qwen3_next import lce_forward as qwen3_next_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForQwen3Next
+    from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+    if rope:
+        # It might encounter NaN issues:
+        # modeling_qwen3_next.apply_rotary_pos_emb = liger_rotary_pos_emb
+        raise NotImplementedError("liger_rotary_pos_emb is not available for Qwen3Next models.")
+    if rms_norm:
+        modeling_qwen3_next.Qwen3NextRMSNorm = LigerRMSNormForQwen3Next
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            if isinstance(model, Qwen3NextForCausalLM):
+                model.forward = MethodType(qwen3_next_lce_forward, model)
+            else:
+                raise TypeError(
+                    f"fused_linear_cross_entropy is only applicable on Qwen3NextForCausalLM. Got: {type(model)}"
+                )
+        else:
+            modeling_qwen3_next.Qwen3NextForCausalLM.forward = qwen3_next_lce_forward
+    if swiglu:
+        if IS_TRANSFORMERS_V5_OR_LATER:
+            modeling_qwen3_next.Qwen3NextExperts = LigerExperts
+        else:
+            # Qwen3MoeMLP and Qwen3NextMLP are identical, hence we reuse LigerQwen3MoeSwiGLUMLP
+            modeling_qwen3_next.Qwen3NextMLP = LigerQwen3MoeSwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, (Qwen3NextForCausalLM, Qwen3NextModel)):
+            base_model: Qwen3NextModel = getattr(model, model.base_model_prefix, model)
+        else:
+            raise TypeError(
+                f"Unsupported qwen3_next model type. `model` must be `Qwen3NextForCausalLM` or `Qwen3NextModel`. Got: {type(model)}"
+            )
+
+        _patch_rms_norm_module_for_qwen3_next = partial(
+            _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+        )
+
+        if rms_norm:
+            _patch_rms_norm_module_for_qwen3_next(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if rms_norm:
+                _patch_rms_norm_module_for_qwen3_next(decoder_layer.input_layernorm)
+                _patch_rms_norm_module_for_qwen3_next(decoder_layer.post_attention_layernorm)
+
+            # Qwen3MoeMLP and Qwen3NextMLP are identical, hence we reuse LigerQwen3MoeSwiGLUMLP
+            if swiglu:
+                if isinstance(decoder_layer.mlp, Qwen3NextMLP):
+                    _patch_swiglu_module(decoder_layer.mlp, LigerQwen3MoeSwiGLUMLP)
+                if isinstance(decoder_layer.mlp, Qwen3NextSparseMoeBlock):
+                    _patch_swiglu_module(decoder_layer.mlp.shared_expert, LigerQwen3MoeSwiGLUMLP)
+                    experts = getattr(decoder_layer.mlp, "experts", None)
+                    if experts is not None:
+                        if IS_TRANSFORMERS_V5_OR_LATER:
+                            _patch_swiglu_module(experts, LigerExperts)
+                        else:
+                            for expert in experts:
+                                _patch_swiglu_module(expert, LigerQwen3MoeSwiGLUMLP)
+
+
+def apply_liger_kernel_to_qwen3_5(
+    rope: bool = False,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3.5 dense models.
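+
+    A minimal usage sketch (illustrative; assumes this function is exported from
+    `liger_kernel.transformers` like the other apply_* entry points):
+
+        apply_liger_kernel_to_qwen3_5()          # pre-load: patch the classes
+        apply_liger_kernel_to_qwen3_5(model=m)   # post-load: patch instance `m`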
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
+            Not yet supported for Qwen3.5 due to hybrid attention (Gated DeltaNet + Gated Attention).
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLUMLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.qwen3_5 import modeling_qwen3_5
+    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM
+    from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5TextModel
+
+    try:
+        from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration
+    except ImportError:
+        Qwen3_5ForConditionalGeneration = None
+
+    from liger_kernel.transformers.model.qwen3_5 import lce_forward as qwen3_5_lce_forward
+    from liger_kernel.transformers.model.qwen3_5 import lce_forward_for_multimodal as qwen3_5_lce_forward_for_multimodal
+    from liger_kernel.transformers.monkey_patch import _patch_rms_norm_module
+    from liger_kernel.transformers.monkey_patch import _patch_swiglu_module
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForQwen3Next
+    from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+    if rope:
+        raise NotImplementedError("liger_rotary_pos_emb is not available for Qwen3_5 models.")
+
+    if rms_norm:
+        modeling_qwen3_5.Qwen3_5RMSNorm = LigerRMSNormForQwen3Next
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        from liger_kernel.transformers.cross_entropy import liger_cross_entropy
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            if isinstance(model, Qwen3_5ForCausalLM):
+                model.forward = MethodType(qwen3_5_lce_forward, model)
+            elif Qwen3_5ForConditionalGeneration is not None and isinstance(model, Qwen3_5ForConditionalGeneration):
+                model.forward = MethodType(qwen3_5_lce_forward_for_multimodal, model)
+            else:
+                raise TypeError(
+                    f"fused_linear_cross_entropy is only applicable on Qwen3_5ForCausalLM or Qwen3_5ForConditionalGeneration. Got: {type(model)}"
+                )
+        else:
+            modeling_qwen3_5.Qwen3_5ForCausalLM.forward = qwen3_5_lce_forward
+            if Qwen3_5ForConditionalGeneration is not None:
+                modeling_qwen3_5.Qwen3_5ForConditionalGeneration.forward = qwen3_5_lce_forward_for_multimodal
+
+    if swiglu:
+        modeling_qwen3_5.Qwen3_5MLP = LigerQwen3MoeSwiGLUMLP
+
+    if model is not None:
+        if isinstance(model, (Qwen3_5ForCausalLM, Qwen3_5TextModel)):
+            text_model: Qwen3_5TextModel = getattr(model, model.base_model_prefix, model)
+        elif Qwen3_5ForConditionalGeneration is not None and isinstance(model, Qwen3_5ForConditionalGeneration):
+            text_model = model.model.language_model
+        else:
+            raise TypeError(f"Unsupported qwen3_5 model type. Got: {type(model)}")
+
+        _patch_rms_norm_module_for_qwen3_5 = partial(
+            _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+        )
+
+        if rms_norm:
+            _patch_rms_norm_module_for_qwen3_5(text_model.norm)
+
+        for decoder_layer in text_model.layers:
+            if rms_norm:
+                _patch_rms_norm_module_for_qwen3_5(decoder_layer.input_layernorm)
+                _patch_rms_norm_module_for_qwen3_5(decoder_layer.post_attention_layernorm)
+
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerQwen3MoeSwiGLUMLP)
+
+
+def apply_liger_kernel_to_qwen3_5_moe(
+    rope: bool = False,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Qwen3.5 MoE models.
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        swiglu (bool): Whether to apply Liger's SwiGLUMLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.qwen3_5_moe import modeling_qwen3_5_moe
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeForCausalLM
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeTextModel
+
+    from liger_kernel.transformers.model.qwen3_5_moe import lce_forward as qwen3_5_moe_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNormForQwen3Next
+    from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+    if rope:
+        raise NotImplementedError("liger_rotary_pos_emb is not available for Qwen3_5Moe models.")
+    if rms_norm:
+        modeling_qwen3_5_moe.Qwen3_5MoeRMSNorm = LigerRMSNormForQwen3Next
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            if isinstance(model, Qwen3_5MoeForCausalLM):
+                model.forward = MethodType(qwen3_5_moe_lce_forward, model)
+            else:
+                raise TypeError(
+                    f"fused_linear_cross_entropy is only applicable on Qwen3_5MoeForCausalLM. Got: {type(model)}"
+                )
+        else:
+            modeling_qwen3_5_moe.Qwen3_5MoeForCausalLM.forward = qwen3_5_moe_lce_forward
+    if swiglu:
+        modeling_qwen3_5_moe.Qwen3_5MoeExperts = LigerExperts
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, (Qwen3_5MoeForCausalLM, Qwen3_5MoeTextModel)):
+            base_model: Qwen3_5MoeTextModel = getattr(model, model.base_model_prefix, model)
+        else:
+            raise TypeError(
+                f"Unsupported qwen3_5_moe model type. `model` must be `Qwen3_5MoeForCausalLM` or `Qwen3_5MoeTextModel`. Got: {type(model)}"
+            )
+
+        _patch_rms_norm_module_for_qwen3_5_moe = partial(
+            _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+        )
+
+        if rms_norm:
+            _patch_rms_norm_module_for_qwen3_5_moe(base_model.norm)
+
+        for decoder_layer in base_model.layers:
+            if rms_norm:
+                _patch_rms_norm_module_for_qwen3_5_moe(decoder_layer.input_layernorm)
+                _patch_rms_norm_module_for_qwen3_5_moe(decoder_layer.post_attention_layernorm)
+
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp.shared_expert, LigerQwen3MoeSwiGLUMLP)
+                experts = getattr(decoder_layer.mlp, "experts", None)
+                if experts is not None:
+                    _patch_swiglu_module(experts, LigerExperts)
+
+
+def apply_liger_kernel_to_hunyuan_v1_dense(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 dense models.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.hunyuan_v1_dense import modeling_hunyuan_v1_dense
+    from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1Model
+
+    from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_lce_forward
+    from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP
+
+    if rope:
+        modeling_hunyuan_v1_dense.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_hunyuan_v1_dense.HunYuanDenseV1RMSNorm = LigerRMSNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(hunyuan_v1_lce_forward, model)
+        else:
+            modeling_hunyuan_v1_dense.HunYuanDenseV1ForCausalLM.forward = hunyuan_v1_lce_forward
+
+    if swiglu:
+        modeling_hunyuan_v1_dense.HunYuanDenseV1MLP = LigerHunyuanV1SwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: HunYuanDenseV1Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerHunyuanV1SwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_hunyuan_v1_moe(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 MoE models.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
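+        # Version note, sketching the swiglu branch below (an observation from this
+        # file's code paths, not additional behavior): on transformers v5+ the whole
+        # `experts` container is patched as one module, while on earlier versions
+        # each expert is a separate MLP and is patched individually:
+        #   if IS_TRANSFORMERS_V5_OR_LATER:
+        #       _patch_swiglu_module(decoder_layer.mlp.experts, LigerExperts)
+        #   else:
+        #       for mlp_expert in decoder_layer.mlp.experts:
+        #           _patch_swiglu_module(mlp_expert, LigerHunyuanV1SwiGLUMLP)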
+ ) + + from transformers.models.hunyuan_v1_moe import modeling_hunyuan_v1_moe + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1Model + + from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_moe_lce_forward + from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP + + if rope: + modeling_hunyuan_v1_moe.apply_rotary_pos_emb = liger_rotary_pos_emb + + if rms_norm: + modeling_hunyuan_v1_moe.HunYuanMoEV1RMSNorm = LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(hunyuan_v1_moe_lce_forward, model) + else: + modeling_hunyuan_v1_moe.HunYuanMoEV1ForCausalLM.forward = hunyuan_v1_moe_lce_forward + + if swiglu: + if IS_TRANSFORMERS_V5_OR_LATER: + modeling_hunyuan_v1_moe.HunYuanMoEV1Experts = LigerExperts + else: + modeling_hunyuan_v1_moe.HunYuanMoEV1MLP = LigerHunyuanV1SwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: HunYuanMoEV1Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + for decoder_layer in base_model.layers: + if swiglu: + if IS_TRANSFORMERS_V5_OR_LATER: + _patch_swiglu_module(decoder_layer.mlp.experts, LigerExperts) + else: + for mlp_expert in decoder_layer.mlp.experts: + _patch_swiglu_module(mlp_expert, LigerHunyuanV1SwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + +def apply_liger_kernel_to_exaone4( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace EXAONE4 models. + + Args: + rope (bool): Whether to apply Liger's rotary position embedding. Default is True. + cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False. + fused_linear_cross_entropy (bool): + Whether to apply Liger's fused linear cross entropy loss. Default is True. + `cross_entropy` and `fused_linear_cross_entropy` cannot both be True. + If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient. + rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True. + swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True. + model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been + loaded. Default is None. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ ) + + from transformers.models.exaone4 import modeling_exaone4 + from transformers.models.exaone4.modeling_exaone4 import Exaone4Model + + from liger_kernel.transformers.model.exaone4 import lce_forward as exaone4_lce_forward + + if rope: + modeling_exaone4.apply_rotary_pos_emb = liger_rotary_pos_emb + + if rms_norm: + # EXAONE4 requires in_place=False to avoid gradient issues + class Exaone4LigerRMSNorm(LigerRMSNorm): + def __init__(self, hidden_size, eps=1e-6, **kwargs): + super().__init__(hidden_size, eps, **kwargs) + self.in_place = False + + modeling_exaone4.Exaone4RMSNorm = Exaone4LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(exaone4_lce_forward, model) + else: + modeling_exaone4.Exaone4ForCausalLM.forward = exaone4_lce_forward + + if swiglu: + modeling_exaone4.Exaone4MLP = LigerSwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: Exaone4Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm, in_place=False) + for decoder_layer in base_model.layers: + if swiglu: + _bind_method_to_module(decoder_layer.mlp, "forward", LigerSwiGLUMLP.forward) + if rms_norm: + _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False) + _patch_rms_norm_module(decoder_layer.self_attn.q_norm, in_place=False) + _patch_rms_norm_module(decoder_layer.self_attn.k_norm, in_place=False) + + +# Model type corresponds to the keys defined in transformers/models/auto/modeling_auto.py +MODEL_TYPE_TO_APPLY_LIGER_FN = { + "gemma": apply_liger_kernel_to_gemma, + "gemma2": apply_liger_kernel_to_gemma2, + "gemma3_text": apply_liger_kernel_to_gemma3_text, + "gemma3": apply_liger_kernel_to_gemma3, + "glm4": apply_liger_kernel_to_glm4, + "glm4v": apply_liger_kernel_to_glm4v, + "glm4v_moe": apply_liger_kernel_to_glm4v_moe, + "gpt_oss": apply_liger_kernel_to_gpt_oss, + "internvl": apply_liger_kernel_to_internvl, + "llama": apply_liger_kernel_to_llama, + "llama4_text": apply_liger_kernel_to_llama4, + "llama4": apply_liger_kernel_to_llama4, + "llava": apply_liger_kernel_to_llava, + "granite": apply_liger_kernel_to_granite, + "mllama": apply_liger_kernel_to_mllama, + "mllama_text_model": apply_liger_kernel_to_mllama, + "mistral": apply_liger_kernel_to_mistral, + "mixtral": apply_liger_kernel_to_mixtral, + "olmo2": apply_liger_kernel_to_olmo2, + "pixtral": apply_liger_kernel_to_pixtral, + "olmo3": apply_liger_kernel_to_olmo3, + "qwen2": apply_liger_kernel_to_qwen2, + "qwen3": apply_liger_kernel_to_qwen3, + "qwen3_moe": apply_liger_kernel_to_qwen3_moe, + "qwen2_vl": apply_liger_kernel_to_qwen2_vl, + "qwen2_vl_text": apply_liger_kernel_to_qwen2_vl, + "qwen2_5_vl": apply_liger_kernel_to_qwen2_5_vl, + "qwen2_5_vl_text": apply_liger_kernel_to_qwen2_5_vl, + "qwen3_next": apply_liger_kernel_to_qwen3_next, + "qwen3_5": apply_liger_kernel_to_qwen3_5, + "qwen3_5_text": apply_liger_kernel_to_qwen3_5, + "qwen3_5_moe": apply_liger_kernel_to_qwen3_5_moe, + "qwen3_5_moe_text": apply_liger_kernel_to_qwen3_5_moe, + "qwen3_vl": apply_liger_kernel_to_qwen3_vl, + "qwen3_vl_text": apply_liger_kernel_to_qwen3_vl, + "qwen3_vl_moe": 
apply_liger_kernel_to_qwen3_vl_moe, + "qwen3_vl_moe_text": apply_liger_kernel_to_qwen3_vl_moe, + "smollm3": apply_liger_kernel_to_smollm3, + "phi3": apply_liger_kernel_to_phi3, + "paligemma": apply_liger_kernel_to_paligemma, + "falcon_h1": apply_liger_kernel_to_falcon_h1, + "smolvlm": apply_liger_kernel_to_smolvlm, + "hunyuan_v1_dense": apply_liger_kernel_to_hunyuan_v1_dense, + "hunyuan_v1_moe": apply_liger_kernel_to_hunyuan_v1_moe, + "exaone4": apply_liger_kernel_to_exaone4, +} + + +def _apply_liger_kernel(model_type: str, **kwargs) -> None: + """ + Applies Liger kernels based on the specified model type. The custom + kernels for the specified model type will be applied with the provided + keyword arguments, otherwise the default configuration will be used. + + ** Note: Calling _apply_liger_kernel() after model initialization cannot fully patch the model; + it must be called before model initialization. + If the model has already been instantiated, use _apply_liger_kernel_to_instance() instead. + + Args: + - model_type: the model type as defined in transformers/models/auto/modeling_auto.py + and specified in the model's config.json + - kwargs: keyword arguments that are passed to the corresponding apply_liger_kernel_to_* function. + """ + if not model_type: + logger.info("Model type was not provided. No Liger kernels will be applied.") + return + + if model_type not in MODEL_TYPE_TO_APPLY_LIGER_FN.keys(): + logger.info(f"There are currently no Liger kernels supported for model type: {model_type}.") + return + + apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type] + apply_fn_signature = inspect.signature(apply_fn) + + # Filter out the keyword arguments that are not supported by the apply function + applicable_kwargs = {key: value for key, value in kwargs.items() if key in apply_fn_signature.parameters} + + logger.info(f"Applying Liger kernels for model type: {model_type} with kwargs: {applicable_kwargs}") + + # Assume this is invoked pre-model initialization, so we only need to patch transformers code + apply_fn(**applicable_kwargs) + + +def _apply_liger_kernel_to_instance(model: PreTrainedModel, **kwargs) -> None: + """ + Applies Liger kernels to the provided model instance. + + Args: + - model: the model instance to apply Liger kernels to + - kwargs: keyword arguments that are passed to the corresponding apply_liger_kernel_to_* function. + """ + model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None) + + if not model_type: + logger.info("Model type could not be determined from model config.
No Liger kernels will be applied.") + return + + if model_type not in MODEL_TYPE_TO_APPLY_LIGER_FN.keys(): + logger.info(f"There are currently no Liger kernels supported for model type: {model_type}.") + return + + apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[model_type] + apply_fn_signature = inspect.signature(apply_fn) + + # Filter out the keyword arguments that are not supported by the apply function + applicable_kwargs = {key: value for key, value in kwargs.items() if key in apply_fn_signature.parameters} + logger.info( + f"Applying Liger kernels to model instance with model type: {model_type} with kwargs: {applicable_kwargs}" + ) + + apply_fn(model=model, **applicable_kwargs) diff --git a/src/liger_kernel/transformers/multi_token_attention.py b/src/liger_kernel/transformers/multi_token_attention.py new file mode 100755 index 0000000000000000000000000000000000000000..38b5c6891d6fc7f8d003950640bac73d546945c5 --- /dev/null +++ b/src/liger_kernel/transformers/multi_token_attention.py @@ -0,0 +1,64 @@ +import math + +import torch +import torch.nn as nn + +from torch.nn.modules.utils import _pair + +from liger_kernel.ops import LigerMultiTokenAttentionFunction + + +class LigerMultiTokenAttention(nn.Module): + r""" + Multi-Token Attention: + out = mask_{0}(conv2d(softmax(mask_{-\inf}(scores)))) + + Reference: https://arxiv.org/pdf/2504.00927 + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = True, + sparse: bool = False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.sparse = sparse + + self.weight = nn.Parameter(torch.empty(out_channels, in_channels // groups, *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.empty(out_channels)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward(self, scores: torch.Tensor) -> torch.Tensor: + return LigerMultiTokenAttentionFunction.apply( + scores, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + self.sparse, + ) diff --git a/src/liger_kernel/transformers/poly_norm.py b/src/liger_kernel/transformers/poly_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..24b991db37422b7f84573f9466126f37909a5524 --- /dev/null +++ b/src/liger_kernel/transformers/poly_norm.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerPolyNormFunction + + +class LigerPolyNorm(nn.Module): + """ + PolyNorm layer wrapper for Liger kernel. + + PolyNorm formula: + y = w₀·norm(x³) + w₁·norm(x²) + w₂·norm(x) + b + where norm(u) = u / sqrt(mean(u²) + ε) + + Reference: + https://github.com/BryceZhuo/PolyCom/ + + Args: + eps: epsilon for numerical stability (default: 1e-6) + in_place: whether to in-place modify grad_output in backward to save memory (default: False). + Set to True to save memory if grad_output is not needed elsewhere. 
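+ + Example (illustrative sketch only; assumes a CUDA device and the defaults above): + >>> norm = LigerPolyNorm(eps=1e-6).to("cuda") + >>> x = torch.randn(2, 128, 1024, device="cuda") + >>> y = norm(x) # output has the same shape as x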
+ """ + + def __init__(self, eps=1e-6, in_place=True): + super().__init__() + # Align with PolyCom reference: initialize weights to (1/3, 1/3, 1/3) and bias to 1.0 + self.weight = nn.Parameter(torch.full((3,), 1.0 / 3.0)) + self.bias = nn.Parameter(torch.tensor(1.0)) + self.variance_epsilon = eps + self.in_place = in_place + + def forward(self, hidden_states): + return LigerPolyNormFunction.apply( + hidden_states, + self.weight, + self.bias, + self.variance_epsilon, + self.in_place, + ) + + def extra_repr(self): + return f"weight_shape={tuple(self.weight.shape)}, eps={self.variance_epsilon}, in_place={self.in_place}" diff --git a/src/liger_kernel/transformers/qwen2vl_mrope.py b/src/liger_kernel/transformers/qwen2vl_mrope.py new file mode 100755 index 0000000000000000000000000000000000000000..75c2b623b65d5e0ddfcdfa1e6f05f1874fcfc92c --- /dev/null +++ b/src/liger_kernel/transformers/qwen2vl_mrope.py @@ -0,0 +1,20 @@ +from liger_kernel.ops import LigerQwen2VLMRopeFunction + + +def liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1): + """ + Applies Multimodal Rotary Positional Embedding (M-RoPE) operation to query and key states. + + Args: + q (torch.Tensor): The query tensor of shape (bsz, n_q_head, seq_len, head_dim). + k (torch.Tensor): The key tensor of shape (bsz, n_kv_head, seq_len, head_dim). + cos (torch.Tensor): The cosine tensor of shape (3, bsz, seq_len, head_dim). + sin (torch.Tensor): The sine tensor of shape (3, bsz, seq_len, head_dim). + mrope_section (List[int]): The multimodal rope section for channel dimension of temporal, height and width in rope calculation. + unsqueeze_dim (int, optional): The dimension to unsqueeze. Defaults to 1. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The query and key tensors after applying the M-RoPE operation. 
+ """ + + return LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section, unsqueeze_dim) diff --git a/src/liger_kernel/transformers/rms_norm.py b/src/liger_kernel/transformers/rms_norm.py new file mode 100755 index 0000000000000000000000000000000000000000..3f5aa7684d7d9324403944565c2459ae8e70b854 --- /dev/null +++ b/src/liger_kernel/transformers/rms_norm.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerRMSNormFunction + + +class LigerRMSNorm(nn.Module): + def __init__( + self, + hidden_size, + eps=1e-6, + offset=0.0, + casting_mode="llama", + init_fn="ones", + in_place=True, + row_mode=None, + elementwise_affine=True, + ): + super().__init__() + assert init_fn in [ + "ones", + "zeros", + ], f"init_fn must be either 'ones' or 'zeros', got {init_fn}" + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = nn.Parameter(torch.ones(hidden_size) if init_fn == "ones" else torch.zeros(hidden_size)) + else: + self.register_parameter("weight", None) + self.variance_epsilon, self.offset, self.casting_mode, self.in_place, self.row_mode = ( + eps, + offset, + casting_mode, + in_place, + row_mode, + ) + + def forward(self, hidden_states): + return LigerRMSNormFunction.apply( + hidden_states, + self.weight, + self.variance_epsilon, + self.offset, + self.casting_mode, + self.in_place, + self.row_mode, + ) + + def extra_repr(self): + return f"weight_shape={tuple(self.weight.shape) if self.weight is not None else None}, eps={self.variance_epsilon}, offset={self.offset}, in_place={self.in_place}, row_mode={self.row_mode}" + + +class LigerRMSNormForGemma(LigerRMSNorm): + def __init__( + self, hidden_size, eps=1e-6, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=True, row_mode=None + ): + super().__init__(hidden_size, eps, offset, casting_mode, init_fn, in_place, row_mode) + + +class LigerRMSNormForGemma2(LigerRMSNorm): + def __init__( + self, hidden_size, eps=1e-6, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=False, row_mode=None + ): + super().__init__(hidden_size, eps, offset, casting_mode, init_fn, in_place, row_mode) + + +class LigerRMSNormForGemma3(LigerRMSNorm): + """Gemma3RMSNorm has a dim argument not hidden_size used in q_norm and k_norm.""" + + def __init__(self, dim, eps=0.000001, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=False): + super().__init__(dim, eps, offset, casting_mode, init_fn, in_place) + + +class LigerRMSNormForOlmo2(LigerRMSNorm): + def __init__( + self, hidden_size, eps=1e-6, offset=0.0, casting_mode="llama", init_fn="ones", in_place=False, row_mode=None + ): + super().__init__(hidden_size, eps, offset, casting_mode, init_fn, in_place, row_mode) + + +class LigerRMSNormForGlm4(LigerRMSNorm): + def __init__( + self, hidden_size, eps=1e-6, offset=0.0, casting_mode="llama", init_fn="ones", in_place=False, row_mode=None + ): + super().__init__(hidden_size, eps, offset, casting_mode, init_fn, in_place, row_mode) + + +class LigerRMSNormForQwen3Next(LigerRMSNorm): + def __init__( + self, hidden_size, eps=1e-6, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=False, row_mode=None + ): + super().__init__(hidden_size, eps, offset, casting_mode, init_fn, in_place, row_mode) diff --git a/src/liger_kernel/transformers/rope.py b/src/liger_kernel/transformers/rope.py new file mode 100755 index 0000000000000000000000000000000000000000..ea2ca86ede0db4a0bd9b8349e78f0833c95e7a87 --- /dev/null +++ b/src/liger_kernel/transformers/rope.py @@ -0,0 +1,64 @@ +from 
typing import Tuple + +import torch + +from liger_kernel.ops import LigerRopeFunction + + +def liger_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """ + Applies Rotary Positional Embedding (RoPE) operation to query and key states. + + Args: + q (torch.Tensor): The query tensor of shape (bsz, n_q_head, seq_len, head_dim). + k (torch.Tensor): The key tensor of shape (bsz, n_kv_head, seq_len, head_dim). + cos (torch.Tensor): The cosine tensor of shape (1, seq_len, head_dim) or (bsz, seq_len, head_dim). + sin (torch.Tensor): The sine tensor of shape (1, seq_len, head_dim) or (bsz, seq_len, head_dim). + position_ids (torch.Tensor, optional): The position ids tensor. Defaults to None. + unsqueeze_dim (int, optional): The dimension to unsqueeze. Defaults to 1. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The query and key tensors after applying the RoPE operation. + """ + + return LigerRopeFunction.apply(q, k, cos, sin, position_ids, unsqueeze_dim) + + +def liger_rotary_pos_emb_vision( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Modified version of liger_rotary_pos_emb for qwen3_vl's apply_rotary_pos_emb_vision function. + Manually transposes the input and output to match the expected shape for liger_rotary_pos_emb. + Reference: https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L116 + + Args: + q (torch.Tensor): The query tensor of shape (seq_length, num_heads, head_dim), + with stride (num_heads * head_dim, head_dim, 1). + k (torch.Tensor): The key tensor of shape (seq_length, num_heads, head_dim), + with stride (num_heads * head_dim, head_dim, 1). Same as q. + cos (torch.Tensor): The cosine tensor of shape (seq_length, head_dim). + sin (torch.Tensor): The sine tensor of shape (seq_length, head_dim). + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The query and key tensors with the same shape and stride as inputs.
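+ + Example (illustrative sketch; sizes are arbitrary assumptions): + >>> q = torch.randn(256, 16, 128, device="cuda") + >>> k = torch.randn(256, 16, 128, device="cuda") + >>> cos = torch.randn(256, 128, device="cuda") + >>> sin = torch.randn(256, 128, device="cuda") + >>> q_rot, k_rot = liger_rotary_pos_emb_vision(q, k, cos, sin)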
+ """ + orig_q_dtype, orig_k_dtype = q.dtype, k.dtype + + # tranpose to (1, num_heads, seq_length, head_dim) and cast to float32 to match liger_rotary_pos_emb input shape + # also unsqueeze for batch dim + q32 = q.to(torch.float32).unsqueeze(0).transpose(1, 2) + k32 = k.to(torch.float32).unsqueeze(0).transpose(1, 2) + cos32 = cos.to(torch.float32) + sin32 = sin.to(torch.float32) + + q_out, k_out = liger_rotary_pos_emb(q32, k32, cos32, sin32) + + # transpose back to (seq_length, num_heads, head_dim) and cast back to original dtype + # also squeeze out batch dim + q_out = q_out.transpose(1, 2).squeeze(0).to(orig_q_dtype) + k_out = k_out.transpose(1, 2).squeeze(0).to(orig_k_dtype) + return q_out, k_out diff --git a/src/liger_kernel/transformers/softmax.py b/src/liger_kernel/transformers/softmax.py new file mode 100755 index 0000000000000000000000000000000000000000..1d81aa16304f1e01873adca6e39d8951d9b80ee9 --- /dev/null +++ b/src/liger_kernel/transformers/softmax.py @@ -0,0 +1,12 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerSoftmaxFunction + + +class LigerSoftmax(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor): + return LigerSoftmaxFunction.apply(x) diff --git a/src/liger_kernel/transformers/sparsemax.py b/src/liger_kernel/transformers/sparsemax.py new file mode 100755 index 0000000000000000000000000000000000000000..af54aac9d889cde8ea7707c3615e9a239e6364c7 --- /dev/null +++ b/src/liger_kernel/transformers/sparsemax.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerSparsemaxFunction + + +class LigerSparsemax(nn.Module): + def __init__(self, dim: int = -1): + super().__init__() + self.dim = dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return LigerSparsemaxFunction.apply(x, self.dim) + + def extra_repr(self) -> str: + return f"dim={self.dim}" diff --git a/src/liger_kernel/transformers/swiglu.py b/src/liger_kernel/transformers/swiglu.py new file mode 100755 index 0000000000000000000000000000000000000000..02bf7dadb9f306359f4dab8c874cb3f28e47d0f8 --- /dev/null +++ b/src/liger_kernel/transformers/swiglu.py @@ -0,0 +1,145 @@ +import torch +import torch.nn as nn + +from liger_kernel.ops import LigerSiLUMulFunction + + +class LigerSwiGLUMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) + + +class LigerBlockSparseTop2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.ffn_dim = config.intermediate_size + self.hidden_dim = config.hidden_size + + self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False) + self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + return 
self.w2(LigerSiLUMulFunction.apply(self.w1(x), self.w3(x))) + + +class LigerExperts(nn.Module): + """ + Patch MixtralExperts for transformers v5 or later to use LigerSiLUMulFunction + https://github.com/huggingface/transformers/blob/393b4b3d28e29b4b05b19b4b7f3242a7fc893637/src/transformers/models/mixtral/modeling_mixtral.py#L63 + """ + + def __init__(self, config): + super().__init__() + if "num_experts" in config: + # qwen3_moe, qwen3_next uses num_experts + self.num_experts = config.num_experts + else: + self.num_experts = config.num_local_experts + if "moe_intermediate_size" in config: + # qwen3_moe, qwen3_next uses moe_intermediate_size + self.intermediate_dim = config.moe_intermediate_size + else: + self.intermediate_dim = config.intermediate_size + + self.hidden_dim = config.hidden_size + self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim)) + self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim)) + + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, hidden_states, top_k_index, top_k_weights): + final_hidden_states = torch.zeros_like(hidden_states) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + for expert_idx in expert_hit: + expert_idx = expert_idx[0] + if expert_idx == self.num_experts: + continue + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1) + current_hidden_states = LigerSiLUMulFunction.apply(gate, up) + current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx]) + current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None] + final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype)) + + return final_hidden_states + + +class LigerPhi3SwiGLUMLP(nn.Module): + """ + Patch Phi3MLP to use LigerSiLUMulFunction + https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/models/phi3/modeling_phi3.py#L241 + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + up_states = self.gate_up_proj(x) + gate, up_states = up_states.chunk(2, dim=-1) + return self.down_proj(LigerSiLUMulFunction.apply(gate, up_states)) + + +class LigerQwen3MoeSwiGLUMLP(nn.Module): + """ + Patch Qwen3MoeMLP to use LigerSiLUMulFunction. 
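+ Illustrative note (an inference from the constructor below, not upstream documentation): the optional + intermediate_size argument lets this one class cover both the dense MLP (config.intermediate_size) and + per-expert MLPs, e.g. LigerQwen3MoeSwiGLUMLP(config, intermediate_size=config.moe_intermediate_size).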
+ https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen3_moe/modular_qwen3_moe.py#L57 + """ + + def __init__(self, config, intermediate_size=None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) + + +class LigerHunyuanV1SwiGLUMLP(nn.Module): + def __init__(self, config, layer_idx=None, is_shared_mlp=False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.layer_idx = layer_idx + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) diff --git a/src/liger_kernel/transformers/tiled_mlp.py b/src/liger_kernel/transformers/tiled_mlp.py new file mode 100755 index 0000000000000000000000000000000000000000..b72507b2eeb19ccfe931c309fba52ec2be3f77ab --- /dev/null +++ b/src/liger_kernel/transformers/tiled_mlp.py @@ -0,0 +1,125 @@ +from typing import Optional + +import torch.nn as nn + +from liger_kernel.ops import LigerGELUMulFunction +from liger_kernel.ops import LigerSiLUMulFunction +from liger_kernel.ops import apply_tiled_mlp + + +class LigerTiledGEGLUMLP(nn.Module): + """ + Memory-efficient GEGLU MLP using tiled computation. + + This module combines GEGLU activation with tiled processing to handle + very long sequences efficiently. The forward pass is recomputed during + backward to save memory. + + Args: + config: Model configuration with hidden_size and intermediate_size attributes + num_shards: Number of shards to split the sequence. 
If None, automatically + calculated as ceil(seqlen / hidden_size) + """ + + def __init__(self, config, num_shards: Optional[int] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.num_shards = num_shards + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + # Validate activation function + if hasattr(config, "hidden_act") and config.hidden_act not in [ + "gelu", + "gelu_new", + "gelu_pytorch_tanh", + ]: + raise ValueError(f"LigerTiledGEGLUMLP requires GELU activation, got {config.hidden_act}") + + def _mlp_forward(self, module, x): + """Internal MLP forward function for tiled computation.""" + gate = module.gate_proj(x) + up = module.up_proj(x) + return module.down_proj(LigerGELUMulFunction.apply(gate, up)) + + def forward(self, x): + """ + Forward pass with tiled computation. + + Args: + x: Input tensor of shape [batch_size, seq_len, hidden_size] + or [seq_len, hidden_size] + + Returns: + Output tensor of the same shape as input + """ + compute_params = [p for p in self.parameters() if p.requires_grad] + + return apply_tiled_mlp( + fn=self._mlp_forward, + mlp_module=self, + x=x, + num_shards=self.num_shards, + compute_params=compute_params, + ) + + +class LigerTiledSwiGLUMLP(nn.Module): + """ + Memory-efficient SwiGLU MLP using tiled computation. + + This module combines SwiGLU activation with tiled processing to handle + very long sequences efficiently. The forward pass is recomputed during + backward to save memory. + + Args: + config: Model configuration with hidden_size and intermediate_size attributes + num_shards: Number of shards to split the sequence. If None, automatically + calculated as ceil(seqlen / hidden_size) + """ + + def __init__(self, config, num_shards: Optional[int] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.num_shards = num_shards + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + # Validate activation function + if hasattr(config, "hidden_act") and config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"LigerTiledSwiGLUMLP requires SiLU/Swish activation, got {config.hidden_act}") + + def _mlp_forward(self, module, x): + """Internal MLP forward function for tiled computation.""" + gate = module.gate_proj(x) + up = module.up_proj(x) + return module.down_proj(LigerSiLUMulFunction.apply(gate, up)) + + def forward(self, x): + """ + Forward pass with tiled computation. 
+ + Args: + x: Input tensor of shape [batch_size, seq_len, hidden_size] + or [seq_len, hidden_size] + + Returns: + Output tensor of the same shape as input + """ + compute_params = [p for p in self.parameters() if p.requires_grad] + + return apply_tiled_mlp( + fn=self._mlp_forward, + mlp_module=self, + x=x, + num_shards=self.num_shards, + compute_params=compute_params, + ) diff --git a/src/liger_kernel/transformers/trainer/__init__.py b/src/liger_kernel/transformers/trainer/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..df5de2038ace73024ec5d42933990cd178aaeeff --- /dev/null +++ b/src/liger_kernel/transformers/trainer/__init__.py @@ -0,0 +1,4 @@ +try: + from liger_kernel.transformers.trainer.orpo_trainer import LigerORPOTrainer # noqa: F401 +except ImportError: + raise ImportError("Please `pip install trl` to use LigerORPOTrainer") diff --git a/src/liger_kernel/transformers/trainer/orpo_trainer.py b/src/liger_kernel/transformers/trainer/orpo_trainer.py new file mode 100755 index 0000000000000000000000000000000000000000..6ae10b35b99770e89f7fdd1481d210e55b1d71f3 --- /dev/null +++ b/src/liger_kernel/transformers/trainer/orpo_trainer.py @@ -0,0 +1,130 @@ +from typing import Dict +from typing import List +from typing import Literal +from typing import Tuple +from typing import Union + +import torch +import torch.nn as nn + +from torch.distributed.fsdp import FullyShardedDataParallel +from trl.trainer import ORPOTrainer + +from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss +from liger_kernel.transformers.fsdp import _FSDPForwardRedirection + + +class LigerORPOTrainer(ORPOTrainer): + def concatenated_forward( + self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]] + ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """ + Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together. + We do this to avoid doing two forward passes, because it's faster for FSDP. 
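+ Note: only the base model (model.model, or the FSDP-wrapped module's .model) is run here to obtain hidden + states; the lm_head projection is folded into LigerFusedLinearORPOLoss below, so full-vocabulary logits are + never materialized (see the fused loss call later in this method).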
+ """ + concatenated_batch = self.concatenated_inputs( + batch, + is_encoder_decoder=self.is_encoder_decoder, + label_pad_token_id=self.label_pad_token_id, + padding_value=self.padding_value, + device=self.accelerator.device, + ) + + model_kwargs = ( + { + "decoder_input_ids": self._shift_right(concatenated_batch["concatenated_labels"]), + } + if self.is_encoder_decoder + else {} + ) + + if self.aux_loss_enabled: + model_kwargs["output_router_logits"] = True + + if self.is_encoder_decoder: + labels = concatenated_batch["concatenated_labels"].clone() + else: + labels = concatenated_batch["concatenated_input_ids"].clone() + attention_mask = concatenated_batch["concatenated_attention_mask"] + labels = torch.where(attention_mask == 1, labels, self.label_pad_token_id) + + if isinstance(model, FullyShardedDataParallel): + outputs = _FSDPForwardRedirection()( + model, + model._fsdp_wrapped_module.model, + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + else: + if isinstance(model, torch.nn.DataParallel): + model = model.module + outputs = model.model( + concatenated_batch["concatenated_input_ids"], + attention_mask=concatenated_batch["concatenated_attention_mask"], + use_cache=False, + **model_kwargs, + ) + + orpo_loss_fn = LigerFusedLinearORPOLoss(ignore_index=self.label_pad_token_id, beta=self.beta) + + def orpo_partial(lm_head, last_hidden_state, concatenated_labels, nll_target): + return orpo_loss_fn( + lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias, nll_target=nll_target + ) + + orpo_loss, aux_outputs = _FSDPForwardRedirection()( + model, + orpo_partial, + model.lm_head, + outputs.last_hidden_state[:, :-1] if not self.is_encoder_decoder else outputs.last_hidden_state, + concatenated_batch["concatenated_labels"][:, 1:] + if not self.is_encoder_decoder + else concatenated_batch["concatenated_labels"], + labels[:, 1:] if not self.is_encoder_decoder else labels, + ) + # if aux_loss_enabled, add the aux_loss to the orpo_loss + if self.aux_loss_enabled: + orpo_loss += self.aux_loss_coef * outputs.aux_loss + + return orpo_loss, aux_outputs + + def get_batch_loss_metrics( + self, + model, + batch: Dict[str, Union[List, torch.LongTensor]], + train_eval: Literal["train", "eval"] = "train", + ): + """Compute the ORPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + loss, aux_outputs = self.concatenated_forward(model, batch) + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_nll_loss, + ) = aux_outputs[:5] + + # return loss, metrics + chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = aux_outputs[5:] + + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean() + metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean() + metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean() + metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean() + metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().mean() + metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().mean() + metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().mean() + metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().mean() + metrics[f"{prefix}nll_loss"] = policy_nll_loss.detach().mean() + 
metrics[f"{prefix}log_odds_ratio"] = log_odds_ratio + metrics[f"{prefix}log_odds_chosen"] = log_odds_chosen + for k, v in metrics.items(): + metrics[k] = v.item() + + return loss, metrics diff --git a/src/liger_kernel/transformers/trainer_integration.py b/src/liger_kernel/transformers/trainer_integration.py new file mode 100755 index 0000000000000000000000000000000000000000..623ceab543aaa4253f56217ac2a93f0597644b5d --- /dev/null +++ b/src/liger_kernel/transformers/trainer_integration.py @@ -0,0 +1,2 @@ +# To not break HF Trainer integration +from liger_kernel.transformers.monkey_patch import _apply_liger_kernel # noqa: F401 diff --git a/src/liger_kernel/transformers/tvd.py b/src/liger_kernel/transformers/tvd.py new file mode 100755 index 0000000000000000000000000000000000000000..b57a4898ca2da8c809a50f059d56b3b9c9c9608c --- /dev/null +++ b/src/liger_kernel/transformers/tvd.py @@ -0,0 +1,13 @@ +import torch.nn as nn + +from liger_kernel.ops import LigerTVDLossFunction + + +class LigerTVDLoss(nn.Module): + def __init__(self, reduction="batchmean", ignore_index: int = -100): + super(LigerTVDLoss, self).__init__() + self.reduction = reduction + self.ignore_index = ignore_index + + def forward(self, p, q, shift_labels=None): + return LigerTVDLossFunction.apply(p, q, shift_labels, self.reduction, self.ignore_index) diff --git a/src/liger_kernel/triton/__init__.py b/src/liger_kernel/triton/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..d373966a9bb488ce448ec6989c919967382fe8c7 --- /dev/null +++ b/src/liger_kernel/triton/__init__.py @@ -0,0 +1 @@ +from liger_kernel.triton.monkey_patch import apply_liger_triton_cache_manager # noqa: F401 diff --git a/src/liger_kernel/triton/monkey_patch.py b/src/liger_kernel/triton/monkey_patch.py new file mode 100755 index 0000000000000000000000000000000000000000..bac4a6a0d6a8fc74b56562bdba3c659e175c39ca --- /dev/null +++ b/src/liger_kernel/triton/monkey_patch.py @@ -0,0 +1,40 @@ +import os +import random + +from triton.runtime.cache import FileCacheManager + + +class LigerTritonFileCacheManager(FileCacheManager): + def put(self, data, filename, binary=True) -> str: + if not self.cache_dir: + raise RuntimeError("Could not create or locate cache dir") + binary = isinstance(data, bytes) + if not binary: + data = str(data) + assert self.lock_path is not None + filepath = self._make_path(filename) + # Random ID to avoid any collisions + rnd_id = random.randint(0, 1000000) + # we use the PID incase a bunch of these around so we can see what PID made it + pid = os.getpid() + # use temp dir to be robust against program interruptions + temp_dir = os.path.join(self.cache_dir, f"tmp.pid_{pid}_{rnd_id}") + os.makedirs(temp_dir, exist_ok=True) + temp_path = os.path.join(temp_dir, filename) + + mode = "wb" if binary else "w" + with open(temp_path, mode) as f: + f.write(data) + # Replace is guaranteed to be atomic on POSIX systems if it succeeds + # so filepath cannot see a partial write + os.replace(temp_path, filepath) + os.removedirs(temp_dir) + return filepath + + +def apply_liger_triton_cache_manager(): + """ + Experimental feature to get around transient FileNotFoundError in triton compilation. 
For more details please see https://github.com/triton-lang/triton/pull/4295 + """ + os.environ["TRITON_CACHE_MANAGER"] = "liger_kernel.triton.monkey_patch:LigerTritonFileCacheManager" diff --git a/src/liger_kernel/utils.py b/src/liger_kernel/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..a6bbc31b5e9b3085d3c73ded576d74653128cc02 --- /dev/null +++ b/src/liger_kernel/utils.py @@ -0,0 +1,125 @@ +try: + import peft # noqa: F401 + + PEFT_AVAILABLE = True +except ImportError: + PEFT_AVAILABLE = False + +import torch + + +def is_peft_available(): + return PEFT_AVAILABLE + + +def infer_comm_backend(): + """ + Get communication backend name based on the environment. + """ + if torch.distributed.is_nccl_available(): + # Works for Nvidia + # TODO: nccl may not work for AMD devices, which may require use of rccl. + return "nccl" + elif is_npu_available(): + # Use Ascend NPU if available (torch.npu) + # Ascend is not a standard torch backend and requires an extension. + # Assume that it is installed if NPUs are being used in + # a multi-device environment. + return "ascend" + # XPU (Intel) if available + elif torch.distributed.distributed_c10d.is_xccl_available(): + return "xccl" + elif torch.distributed.is_mpi_available(): + # CPU backend, first option + return "mpi" + elif torch.distributed.is_gloo_available(): + # CPU backend, backup option + return "gloo" + else: + raise RuntimeError("There is no distributed backend available.") + + +def infer_device(): + """ + Get current device name based on available devices + """ + if torch.cuda.is_available(): # Works for both Nvidia and AMD + return "cuda" + # Use Ascend NPU if available (torch.npu) + elif is_npu_available(): + return "npu" + # XPU (Intel) if available + elif torch.xpu.is_available(): + return "xpu" + else: + return "cpu" + + +def is_npu_available() -> bool: + """Detect Ascend NPU availability.""" + try: + from transformers.utils import is_torch_npu_available + + return is_torch_npu_available() + except Exception: + return False + + +def transformers_version_dispatch( + required_version: str, + before_fn, + after_fn, + before_args: tuple = (), + after_args: tuple = (), + before_kwargs: dict = None, + after_kwargs: dict = None, +): + """ + Dispatches to different functions based on package version comparison. + + Args: + required_version: Version to compare against (e.g. "4.48.0") + before_fn: Function to call if package_version < required_version + after_fn: Function to call if package_version >= required_version + before_args: Positional arguments for before_fn + after_args: Positional arguments for after_fn + before_kwargs: Keyword arguments for before_fn + after_kwargs: Keyword arguments for after_fn + + Returns: + Result from either before_fn or after_fn + + Example: + >>> rotary_emb = transformers_version_dispatch( + ... "4.48.0", + ... LlamaRotaryEmbedding, + ... LlamaRotaryEmbedding, + ... before_args=(head_dim,), + ... after_args=(LlamaConfig(head_dim=head_dim),), + ... before_kwargs={'device': device}, + ... after_kwargs={'device': device} + ... 
) + """ + from packaging import version + from transformers import __version__ as transformers_version + + before_kwargs = before_kwargs or {} + after_kwargs = after_kwargs or {} + + if version.parse(transformers_version) < version.parse(required_version): + return before_fn(*before_args, **before_kwargs) + else: + return after_fn(*after_args, **after_kwargs) + + +def get_total_gpu_memory() -> int: + """Returns total GPU memory in GiB.""" + device = infer_device() + if device == "cuda": + return torch.cuda.get_device_properties(0).total_memory // (1024**3) + elif device == "xpu": + return torch.xpu.get_device_properties(0).total_memory // (1024**3) + elif device == "npu": + return torch.npu.get_device_properties(0).total_memory // (1024**3) + else: + raise RuntimeError(f"Unsupported device: {device}") diff --git a/test/__init__.py b/test/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/chunked_loss/__init__.py b/test/chunked_loss/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/chunked_loss/test_cosine_loss.py b/test/chunked_loss/test_cosine_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..b2711c04d875666830bb9a9c3030d1829ecac253 --- /dev/null +++ b/test/chunked_loss/test_cosine_loss.py @@ -0,0 +1,320 @@ +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction +from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_cosine +from liger_kernel.utils import infer_device +from test.utils import HFDistillationLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() +set_seed() + + +class HFCosineLoss(HFDistillationLoss): + """ + Implementation of a distillation loss using cosine similarity. + """ + + def __init__( + self, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__( + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ignore_index=ignore_index, + temperature=temperature, + ) + + def distillation_loss(self, student_logits, teacher_logits, target=None, ignore_index=None, beta=1.0, **kwargs): + # Compute normalized logits + student_norm = F.normalize(student_logits, p=2, dim=-1) + teacher_norm = F.normalize(teacher_logits, p=2, dim=-1) + cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1) + + loss = beta * (1 - cosine_sim) + return loss.mean() + + +class TorchCosineLoss(torch.nn.Module): + """ + Reference implementation for Cosine Similarity Loss using standard torch operations. + Computes the loss as 1 - cosine_similarity averaged over all tokens. + """ + + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool, + device: torch.device, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 1.0, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__() + # Note: student inputs are expected to have hidden size H//2 while teacher inputs have H.
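+ # (assumption) the halved student width emulates distilling from a wider teacher into a smaller student; + # both linear heads still project to the same vocabulary size V, so their logits remain comparable.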
+ self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype, device=device) + self.beta = beta + self.cosine = HFCosineLoss( + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + ).get_batch_loss_metrics + + def forward(self, student_input, teacher_input, target): + loss = self.cosine( + student_input, + self.student_lin.weight, + teacher_input, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + beta=self.beta, + ) + return loss + + +class LigerCosineLoss(torch.nn.Module): + """ + Liger implementation that uses fused cosine similarity loss. + """ + + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool, + device: torch.device, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + compiled: bool = True, + chunk_size: int = 1024, + ): + super().__init__() + self.chunked_cosine = LigerFusedLinearCosineSimilarityLoss( + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + beta=beta, + ignore_index=ignore_index, + temperature=temperature, + compiled=compiled, + chunk_size=chunk_size, + ) + self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype, device=device) + + def forward(self, student_input, teacher_input, target): + return self.chunked_cosine( + student_input, + self.student_lin.weight, + teacher_input, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + ) + + +############################################################################### +# Test correctness of the module implementations +############################################################################### + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 32, 128), # H must be even + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize( + "temperature, weight_hard_loss, weight_soft_loss, beta", + [ + (1.0, 0.5, 0.5, 0.5), + (2.0, 0.0, 1.0, 0.8), + (0.5, 1.0, 0.0, 0.2), + ], +) +def test_correctness( + B, T, H, V, scalar, dtype, atol, rtol, bias, temperature, weight_hard_loss, weight_soft_loss, beta +): + torch_cosine = TorchCosineLoss( + H=H, + V=V, + dtype=dtype, + bias=bias, + device=device, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + beta=beta, + ) + liger_cosine = LigerCosineLoss( + H=H, + V=V, + dtype=dtype, + bias=bias, + device=device, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + beta=beta, + ) + # Ensure both implementations start with the same weights and biases. 
torch_cosine.student_lin.weight.data = liger_cosine.student_lin.weight.data = torch.rand( + V, H // 2, device=device, dtype=dtype + ) + torch_cosine.teacher_lin.weight.data = liger_cosine.teacher_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + if bias: + torch_cosine.student_lin.bias.data = liger_cosine.student_lin.bias.data = torch.rand( + V, device=device, dtype=dtype + ) + torch_cosine.teacher_lin.bias.data = liger_cosine.teacher_lin.bias.data = torch.rand( + V, device=device, dtype=dtype + ) + + _tensor = torch.rand(B * T, H // 2, device=device, dtype=dtype) * scalar + student_input1 = _tensor.clone().detach().requires_grad_(True) + student_input2 = _tensor.clone().detach().requires_grad_(True) + teacher_input = torch.rand(B * T, H, device=device, dtype=dtype) * scalar + # Dummy target (not used in cosine computation) + target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + loss1 = torch_cosine(student_input1, teacher_input, target) + loss2 = liger_cosine(student_input2, teacher_input, target) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(student_input1.grad, student_input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_cosine.student_lin.weight.grad, liger_cosine.student_lin.weight.grad, atol=atol, rtol=rtol + ) + if bias: + assert_verbose_allclose( + torch_cosine.student_lin.bias.grad, liger_cosine.student_lin.bias.grad, atol=atol, rtol=rtol + ) + + +############################################################################### +# Test correctness of the functional interface +############################################################################### + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (9, 7, 40, 40), # H must be even + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-2), + (1.0, torch.float32, 1e-4, 5e-3), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize( + "temperature, weight_hard_loss, weight_soft_loss, beta, ignore_index", + [ + (1.0, 0.5, 0.5, 0.5, -100), + (2.0, 0.1, 0.9, 0.5, 42), + ], +) +def test_correctness_functional( + B, T, H, V, scalar, dtype, bias, weight_hard_loss, weight_soft_loss, beta, ignore_index, temperature, atol, rtol +): + # Prepare weights and biases for functional testing.
+ student_weight1 = torch.rand(V, H // 2, device=device, dtype=dtype).detach().clone().requires_grad_(True) + student_weight2 = student_weight1.clone().detach().requires_grad_(True) + teacher_weight = torch.rand(V, H, device=device, dtype=dtype) + + if bias: + student_bias1 = torch.rand(V, device=device, dtype=dtype).detach().clone().requires_grad_(True) + student_bias2 = student_bias1.clone().detach().requires_grad_(True) + teacher_bias = torch.rand(V, device=device, dtype=dtype) + else: + student_bias1 = student_bias2 = teacher_bias = None + + _tensor = torch.rand(B * T, H // 2, device=device, dtype=dtype) * scalar + student_input1 = _tensor.clone().detach().requires_grad_(True) + student_input2 = _tensor.clone().detach().requires_grad_(True) + teacher_input = torch.rand(B * T, H, device=device, dtype=dtype) * scalar + target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + + # Functional call using the fused cosine similarity function + output1 = liger_fused_linear_cosine( + student_input1, + student_weight1, + teacher_input, + teacher_weight, + target, + student_bias1, + teacher_bias, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, + temperature, + True, + 1024, + ) + output2 = LigerFusedLinearCosineSimilarityFunction.apply( + student_input2, + student_weight2, + teacher_input, + teacher_weight, + target, + student_bias2, + teacher_bias, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, + temperature, + True, + 1024, + ) + + assert_verbose_allclose(output1, output2, atol=atol, rtol=rtol) + output1.backward() + output2.backward() + + assert_verbose_allclose(student_input1.grad, student_input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(student_weight1.grad, student_weight2.grad, atol=atol, rtol=rtol) + if bias: + assert_verbose_allclose(student_bias1.grad, student_bias2.grad, atol=atol, rtol=rtol) diff --git a/test/chunked_loss/test_cpo_loss.py b/test/chunked_loss/test_cpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..c996e57f9de2628ee18d7541d3cdcaeed4a9c98f --- /dev/null +++ b/test/chunked_loss/test_cpo_loss.py @@ -0,0 +1,302 @@ +from typing import Tuple + +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearCPOLoss +from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction +from liger_kernel.chunked_loss.functional import liger_fused_linear_cpo +from liger_kernel.utils import infer_device +from test.utils import HFAlignmentLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +class HFCPOLoss(HFAlignmentLoss): + """ + HF's implementation of CPO loss in TRL. https://github.com/huggingface/trl/blob/main/trl/trainer/cpo_trainer.py + """ + + def __init__( + self, + alpha: float = 1.0, + beta: float = 0.1, + ignore_index: int = -100, + label_smoothing: float = 0.0, + simpo_gamma: float = 0.5, + loss_type: str = "sigmoid", + ): + super().__init__(alpha=alpha, beta=beta, ignore_index=ignore_index) + # Sigmoid defaults to the CPO loss defined in the paper listed above. 
+ self.loss_type = loss_type + self.label_smoothing = label_smoothing + self.simpo_gamma = simpo_gamma + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + """Compute the CPO loss for a batch of policy and reference model log probabilities. + + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). + The losses tensor contains the CPO loss for each example in the batch. + The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively. + """ + logits = policy_chosen_logps - policy_rejected_logps + + # The beta is a temperature parameter for the CPO loss, typically something in the range of 0.1 to 0.5. + # We ignore the reference model as beta -> 0. The label_smoothing parameter encodes our uncertainty about the labels and + # calculates a conservative CPO loss. + if self.loss_type == "sigmoid": + # This reduces to Equation 3 from the CPO paper when label_smoothing -> 0. + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + elif self.loss_type == "simpo": + logits = logits - (self.simpo_gamma / self.beta) + losses = ( + -F.logsigmoid(self.beta * logits) * (1 - self.label_smoothing) + - F.logsigmoid(-self.beta * logits) * self.label_smoothing + ) + else: + raise ValueError(f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid']") + + chosen_rewards = self.beta * policy_chosen_logps + rejected_rewards = self.beta * policy_rejected_logps + + return losses, chosen_rewards, rejected_rewards + + +class TorchLMHeadCPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + alpha: float = 1.0, + label_smoothing: float = 0.0, + loss_type: str = "sigmoid", + simpo_gamma: float = 0.5, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.cpo_loss = HFCPOLoss( + ignore_index=ignore_index, + beta=beta, + loss_type=loss_type, + label_smoothing=label_smoothing, + simpo_gamma=simpo_gamma, + ).get_batch_loss_metrics + self.average_log_prob = loss_type == "simpo" + + def forward(self, x, y): + return self.cpo_loss(self.lin.weight, x, y, self.lin.bias, average_log_prob=self.average_log_prob) + + +class LigerLMHeadCPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + alpha: float = 1.0, + label_smoothing: float = 0.0, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.cpo_loss = LigerFusedLinearCPOLoss( + ignore_index=ignore_index, + beta=beta, + alpha=alpha, + label_smoothing=label_smoothing, + ) + + def forward(self, x, y): + return self.cpo_loss(self.lin.weight, x, y, self.lin.bias) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-2), + (1.0, torch.float32, 1e-5, 5e-4), + 
], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ignore_index, beta, alpha", [(-100, 0.1, 1.0), (42, 0.2, 0.85)]) +@pytest.mark.parametrize("label_smoothing", [0.0, 0.1]) +def test_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + ignore_index, + beta, + alpha, + label_smoothing, +): + B = 2 * B # cpo loss requires B to be even + + torch_lm_head_cpo = TorchLMHeadCPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + label_smoothing=label_smoothing, + ) + liger_lm_head_cpo = LigerLMHeadCPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + label_smoothing=label_smoothing, + ) + + torch_lm_head_cpo.lin.weight.data = liger_lm_head_cpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_cpo.lin.bias.data = liger_lm_head_cpo.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_cpo(input1, target) + loss2, aggregated_aux_outputs2 = liger_lm_head_cpo(input2, target) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + for i in range(len(aggregated_aux_outputs1)): + assert_verbose_allclose( + aggregated_aux_outputs1[i], + aggregated_aux_outputs2[i], + atol=atol, + rtol=rtol, + ) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_cpo.lin.weight.grad, + liger_lm_head_cpo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_cpo.lin.bias.grad, + liger_lm_head_cpo.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias): + B = 2 * B + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + + _weight = torch.randn(V, H, device=device, dtype=dtype) + weight1 = _weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + _bias = torch.randn(V, device=device, dtype=dtype) if bias else None + bias1 = _bias.detach().clone().requires_grad_(True) if bias else None + bias2 = _bias.detach().clone().requires_grad_(True) if bias else None + + loss1, aggregated_aux_outputs1 = LigerFusedLinearCPOFunction.apply(input1, weight1, target, bias1) + loss2, aggregated_aux_outputs2 = 
liger_fused_linear_cpo(input2, weight2, target, bias2) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol) + if bias: + assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol) diff --git a/test/chunked_loss/test_dpo_loss.py b/test/chunked_loss/test_dpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..de5762f26682e0337ab1f1ec382695d0578ae3d1 --- /dev/null +++ b/test/chunked_loss/test_dpo_loss.py @@ -0,0 +1,938 @@ +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss +from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction +from liger_kernel.chunked_loss.functional import liger_fused_linear_dpo +from liger_kernel.utils import infer_device +from test.utils import HFAlignmentLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +class HFDPOLoss(HFAlignmentLoss): + """ + Implementation of the Direct Preference Optimization (DPO) loss, + adapted from Hugging Face's implementation. + Reference: https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py + """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + compute_nll_loss: bool = False, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + compute_nll_loss=compute_nll_loss, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ): + """Compute DPO loss for a batch of policy log probabilities. + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + The losses tensor contains the DPO loss for each example in the batch. + """ + # Derived from https://huggingface.co/papers/2305.18290 + chosen_logratios = policy_chosen_logps - ref_chosen_logps + rejected_logratios = policy_rejected_logps - ref_rejected_logps + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + logits_diff = self.beta * (chosen_logratios - rejected_logratios) + losses = -F.logsigmoid(logits_diff) + return losses, chosen_rewards, rejected_rewards + + +class HFAPOZeroLoss(HFAlignmentLoss): + """ + Implementation of the APO-zero loss. + Reference: https://huggingface.co/papers/2408.06266 + """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + compute_nll_loss: bool = False, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + compute_nll_loss=compute_nll_loss, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ): + """Compute APO-zero loss for a batch of policy log probabilities. + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. 
Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + The losses tensor contains the APO-zero loss for each example in the batch. + """ + # Eqn (7) of the APO paper (https://huggingface.co/papers/2408.06266) + chosen_logratios = policy_chosen_logps - ref_chosen_logps + rejected_logratios = policy_rejected_logps - ref_rejected_logps + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + # Use this loss when you believe the chosen outputs are better than your model's default output + losses_chosen = 1 - F.sigmoid(self.beta * chosen_logratios) # Increase chosen likelihood + losses_rejected = F.sigmoid(self.beta * rejected_logratios) # Decrease rejected likelihood + losses = losses_chosen + losses_rejected + + return losses, chosen_rewards, rejected_rewards + + +class HFAPODownLoss(HFAlignmentLoss): + """ + Implementation of the APO-down loss. + Reference: https://huggingface.co/papers/2408.06266 + """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + compute_nll_loss: bool = False, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + compute_nll_loss=compute_nll_loss, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ): + """Compute APO-down loss for a batch of policy log probabilities. + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + The losses tensor contains the APO-down loss for each example in the batch. + """ + # Eqn (8) of the APO paper (https://huggingface.co/papers/2408.06266) + chosen_logratios = policy_chosen_logps - ref_chosen_logps + rejected_logratios = policy_rejected_logps - ref_rejected_logps + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + # Use this loss when you believe the chosen outputs are worse than your model's default output. 
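+        # In closed form (matching the code below):
+        #   losses = sigmoid(beta * chosen_logratios) + 1 - sigmoid(beta * (chosen_logratios - rejected_logratios))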
+ # Decrease chosen likelihood and decrease rejected likelihood more + losses_chosen = F.sigmoid(self.beta * chosen_logratios) + losses_rejected = 1 - F.sigmoid(self.beta * (chosen_logratios - rejected_logratios)) + losses = losses_chosen + losses_rejected + + return losses, chosen_rewards, rejected_rewards + + +class HFSPPPOHARDLoss(HFAlignmentLoss): + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + compute_nll_loss: bool = False, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + compute_nll_loss=compute_nll_loss, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ): + chosen_logratios = policy_chosen_logps - ref_chosen_logps + rejected_logratios = policy_rejected_logps - ref_rejected_logps + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + a = policy_chosen_logps - ref_chosen_logps + b = policy_rejected_logps - ref_rejected_logps + losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2 + + return losses, chosen_rewards, rejected_rewards + + +class HFNCAPAIRLoss(HFAlignmentLoss): + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + compute_nll_loss: bool = False, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + compute_nll_loss=compute_nll_loss, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + ): + chosen_logratios = policy_chosen_logps - ref_chosen_logps + rejected_logratios = policy_rejected_logps - ref_rejected_logps + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + losses = ( + -F.logsigmoid(chosen_rewards) - 0.5 * F.logsigmoid(-chosen_rewards) - 0.5 * F.logsigmoid(-rejected_rewards) + ) + + return losses, chosen_rewards, rejected_rewards + + +class TorchLMHeadDPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.dpo_loss = HFDPOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + compute_nll_loss=compute_nll_loss, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y): + return self.dpo_loss( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + average_log_prob=True, + ) + + +class TorchLMHeadAPOZero(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.apo_loss = HFAPOZeroLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + 
compute_nll_loss=compute_nll_loss, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y): + return self.apo_loss( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + average_log_prob=True, + ) + + +class TorchLMHeadAPODown(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.apo_loss = HFAPODownLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + compute_nll_loss=compute_nll_loss, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y): + return self.apo_loss( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + average_log_prob=True, + ) + + +class TorchLMHeadSPPOHARD(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.sppo_hard = HFSPPPOHARDLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + compute_nll_loss=compute_nll_loss, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y): + return self.sppo_hard( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + average_log_prob=True, + ) + + +class TorchLMHeadNCAPAIR(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.nca_pair = HFNCAPAIRLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + compute_nll_loss=compute_nll_loss, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y): + return self.nca_pair( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + average_log_prob=True, + ) + + +class LigerLMHeadDPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + compute_nll_loss: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + loss_type: str = "sigmoid", + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.dpo_loss = LigerFusedLinearDPOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + compute_nll_loss=compute_nll_loss, + average_log_prob=True, + loss_type=loss_type, + ) + + def forward(self, x, ref_x, y): + return self.dpo_loss( + self.lin.weight, + x, + y, + self.lin.bias, + ref_x, + self.ref_lin.weight, + self.ref_lin.bias, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + 
(3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ref_bias", [True, False]) +@pytest.mark.parametrize("compute_nll_loss", [True, False]) +@pytest.mark.parametrize("ignore_index, beta", [(-100, 0.1), (42, 0.2)]) +def test_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + ref_bias, + compute_nll_loss, + ignore_index, + beta, +): + B = 2 * B # dpo loss requires B to be even + + torch_lm_head_dpo = TorchLMHeadDPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + compute_nll_loss=compute_nll_loss, + ignore_index=ignore_index, + beta=beta, + ) + liger_lm_head_dpo = LigerLMHeadDPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + compute_nll_loss=compute_nll_loss, + ignore_index=ignore_index, + beta=beta, + ) + + torch_lm_head_dpo.lin.weight.data = liger_lm_head_dpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + torch_lm_head_dpo.ref_lin.weight.data = liger_lm_head_dpo.ref_lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_dpo.lin.bias.data = liger_lm_head_dpo.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + if ref_bias: + torch_lm_head_dpo.ref_lin.bias.data = liger_lm_head_dpo.ref_lin.bias.data = torch.randn( + V, device=device, dtype=dtype + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_dpo(input1, ref_input, target) + loss2, aggregated_aux_outputs2 = liger_lm_head_dpo(input2, ref_input, target) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + for i in range(len(aggregated_aux_outputs1)): + if i > 4 and dtype == torch.bfloat16: + # numerical instability in bf16 for chosen_rewards and rejected_rewards + # temporary fix. 
TODO: investigate how to reduce numerical instability issue
+            assert_verbose_allclose(
+                aggregated_aux_outputs1[i],
+                aggregated_aux_outputs2[i],
+                atol=5e-1,
+                rtol=rtol,
+            )
+            continue
+        assert_verbose_allclose(
+            aggregated_aux_outputs1[i],
+            aggregated_aux_outputs2[i],
+            atol=atol,
+            rtol=rtol,
+        )
+
+    loss1.backward()
+    loss2.backward()
+
+    assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol)
+    assert_verbose_allclose(
+        torch_lm_head_dpo.lin.weight.grad,
+        liger_lm_head_dpo.lin.weight.grad,
+        atol=atol,
+        rtol=rtol,
+    )
+    if bias:
+        assert_verbose_allclose(
+            torch_lm_head_dpo.lin.bias.grad,
+            liger_lm_head_dpo.lin.bias.grad,
+            atol=atol,
+            rtol=rtol,
+        )
+
+
+@pytest.mark.parametrize(
+    "B, T, H, V",
+    [
+        (2, 2, 8, 8),
+        (3, 47, 31, 123),  # random shape
+    ],
+)
+@pytest.mark.parametrize(
+    "scalar, dtype, atol, rtol",
+    [
+        (1.0, torch.bfloat16, 5e-2, 5e-1),
+        (1.0, torch.float32, 1e-5, 5e-4),
+    ],
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("ref_bias", [True, False])
+@pytest.mark.parametrize("compute_nll_loss", [True, False])
+def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, compute_nll_loss):
+    B = 2 * B
+
+    _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar
+    input1 = _input.detach().clone().requires_grad_(True)
+    input2 = _input.detach().clone().requires_grad_(True)
+
+    ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar
+
+    target = torch.randint(
+        0,
+        V,
+        (
+            B,
+            T,
+        ),
+        device=device,
+        dtype=torch.long,
+    )
+
+    _weight = torch.randn(V, H, device=device, dtype=dtype)
+    weight1 = _weight.detach().clone().requires_grad_(True)
+    weight2 = _weight.detach().clone().requires_grad_(True)
+
+    _ref_weight = torch.randn(V, H, device=device, dtype=dtype)
+    ref_weight1 = _ref_weight.detach().clone().requires_grad_(True)
+    ref_weight2 = _ref_weight.detach().clone().requires_grad_(True)
+
+    _bias = torch.randn(V, device=device, dtype=dtype) if bias else None
+    bias1 = _bias.detach().clone().requires_grad_(True) if bias else None
+    bias2 = _bias.detach().clone().requires_grad_(True) if bias else None
+
+    _ref_bias = torch.randn(V, device=device, dtype=dtype) if ref_bias else None
+    ref_bias1 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None
+    ref_bias2 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None
+
+    loss1, aggregated_aux_outputs1 = LigerFusedLinearDPOFunction.apply(
+        input1,
+        weight1,
+        target,
+        bias1,
+        ref_input,
+        ref_weight1,
+        ref_bias1,
+        -100,
+        0.1,
+        compute_nll_loss,
+    )
+    loss2, aggregated_aux_outputs2 = liger_fused_linear_dpo(
+        input2,
+        weight2,
+        target,
+        bias2,
+        ref_input,
+        ref_weight2,
+        ref_bias2,
+        -100,
+        0.1,
+        compute_nll_loss,
+    )
+
+    assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol)
+
+    loss1.backward()
+    loss2.backward()
+
+    assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol)
+    assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol)
+    if bias:
+        assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol)
+
+
+@pytest.mark.parametrize(
+    "B, T, H, V",
+    [
+        (8, 128, 1024, 4096),
+        (3, 47, 31, 123),  # random shape
+    ],
+)
+@pytest.mark.parametrize(
+    "scalar, dtype, atol, rtol",
+    [
+        (1.0, torch.bfloat16, 5e-2, 5e-1),
+        (1.0, torch.float32, 1e-5, 5e-4),
+    ],
+)
+@pytest.mark.parametrize("bias", [True, False])
+@pytest.mark.parametrize("ref_bias", [True, False])
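+# Note: the loss_type strings parametrized below map onto the HF reference
+# implementations defined above: "apo_zero" -> HFAPOZeroLoss (eqn 7),
+# "apo_down" -> HFAPODownLoss (eqn 8), "sppo_hard" -> HFSPPPOHARDLoss,
+# "nca_pair" -> HFNCAPAIRLoss.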
+@pytest.mark.parametrize("compute_nll_loss", [True, False]) +@pytest.mark.parametrize("ignore_index, beta", [(-100, 0.1), (42, 0.2)]) +@pytest.mark.parametrize("loss_type", ["apo_zero", "apo_down", "sppo_hard", "nca_pair"]) +def test_correctness_apo_loss_types( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + ref_bias, + compute_nll_loss, + ignore_index, + beta, + loss_type, +): + B = 2 * B # dpo loss requires B to be even + + # Select the appropriate HF reference implementation + if loss_type == "apo_zero": + torch_lm_head = TorchLMHeadAPOZero + elif loss_type == "apo_down": + torch_lm_head = TorchLMHeadAPODown + elif loss_type == "sppo_hard": + torch_lm_head = TorchLMHeadSPPOHARD + elif loss_type == "nca_pair": + torch_lm_head = TorchLMHeadNCAPAIR + else: + raise ValueError(f"Unsupported loss_type: {loss_type}") + + torch_lm_head_apo = torch_lm_head( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + compute_nll_loss=compute_nll_loss, + ignore_index=ignore_index, + beta=beta, + ) + liger_lm_head_apo = LigerLMHeadDPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + compute_nll_loss=compute_nll_loss, + ignore_index=ignore_index, + beta=beta, + loss_type=loss_type, + ) + + torch_lm_head_apo.lin.weight.data = liger_lm_head_apo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + torch_lm_head_apo.ref_lin.weight.data = liger_lm_head_apo.ref_lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_apo.lin.bias.data = liger_lm_head_apo.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + if ref_bias: + torch_lm_head_apo.ref_lin.bias.data = liger_lm_head_apo.ref_lin.bias.data = torch.randn( + V, device=device, dtype=dtype + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_apo(input1, ref_input, target) + loss2, aggregated_aux_outputs2 = liger_lm_head_apo(input2, ref_input, target) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + for i in range(len(aggregated_aux_outputs1)): + if i > 4 and dtype == torch.bfloat16: + # numerical instability in bf16 for chosen_rewards and rejected_rewards + # temporary fix. 
TODO: investigate how to reduce numerical instability issue + assert_verbose_allclose( + aggregated_aux_outputs1[i], + aggregated_aux_outputs2[i], + atol=5e-1, + rtol=rtol, + ) + continue + assert_verbose_allclose( + aggregated_aux_outputs1[i], + aggregated_aux_outputs2[i], + atol=atol, + rtol=rtol, + ) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_apo.lin.weight.grad, + liger_lm_head_apo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_apo.lin.bias.grad, + liger_lm_head_apo.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ref_bias", [True, False]) +@pytest.mark.parametrize("compute_nll_loss", [True, False]) +@pytest.mark.parametrize("loss_type", ["apo_zero", "apo_down", "sppo_hard", "nca_pair"]) +def test_correctness_functional_apo_loss_types( + B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, compute_nll_loss, loss_type +): + B = 2 * B + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + + _weight = torch.randn(V, H, device=device, dtype=dtype) + weight1 = _weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + _ref_weight = torch.randn(V, H, device=device, dtype=dtype) + ref_weight1 = _ref_weight.detach().clone().requires_grad_(True) + ref_weight2 = _ref_weight.detach().clone().requires_grad_(True) + + _bias = torch.randn(V, device=device, dtype=dtype) if bias else None + bias1 = _bias.detach().clone().requires_grad_(True) if bias else None + bias2 = _bias.detach().clone().requires_grad_(True) if bias else None + + _ref_bias = torch.randn(V, device=device, dtype=dtype) if ref_bias else None + ref_bias1 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None + ref_bias2 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None + + # Call with loss_type parameter for LigerFusedLinearDPOFunction + loss1, aggregated_aux_outputs1 = LigerFusedLinearDPOFunction.apply( + input1, + weight1, + target, + bias1, + ref_input, + ref_weight1, + ref_bias1, + -100, + 0.1, + compute_nll_loss, + True, # compiled + True, # use_ref_model + False, # average_log_prob + 1, # chunk_size + loss_type, # loss_type + ) + + # For comparison, create a LigerFusedLinearDPOLoss with the loss_type + dpo_loss_fn = LigerFusedLinearDPOLoss( + ignore_index=-100, + beta=0.1, + compute_nll_loss=compute_nll_loss, + loss_type=loss_type, + ) + + loss2, aggregated_aux_outputs2 = dpo_loss_fn( + weight2, + input2, + target, + bias2, + ref_input, + ref_weight2, + ref_bias2, + ) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol) + if bias: + 
assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol) + + +def test_invalid_loss_type(): + """Test that invalid loss types raise ValueError""" + with pytest.raises(ValueError, match="Unsupported loss_type"): + LigerFusedLinearDPOLoss(loss_type="invalid_loss_type") + + # Test that valid loss types don't raise errors + valid_loss_types = ["sigmoid", "apo_zero", "apo_down", "sppo_hard", "nca_pair"] + for loss_type in valid_loss_types: + # Should not raise an exception + loss_fn = LigerFusedLinearDPOLoss(loss_type=loss_type) + assert loss_fn.loss_type == loss_type diff --git a/test/chunked_loss/test_grpo_loss.py b/test/chunked_loss/test_grpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..59221a666a433741cdfddafacea43ab083a664dd --- /dev/null +++ b/test/chunked_loss/test_grpo_loss.py @@ -0,0 +1,993 @@ +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_grpo +from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase +from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction +from liger_kernel.transformers.grpo_loss import _reduce_grpo_loss +from liger_kernel.transformers.grpo_loss import triton_grpo_loss +from liger_kernel.utils import infer_device +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +def sapo_loss_fn(importance_ratio: torch.Tensor, temperature: float) -> torch.Tensor: + """SAPO (Soft Adaptive Policy Optimization) loss function for torch reference. + + Reference: https://huggingface.co/papers/2511.20347 + TRL implementation: https://github.com/huggingface/trl/blob/1bd2a52ec2d8344050af736d60cdc735181ae4b8/trl/trainer/grpo_trainer.py#L1913 + """ + if temperature <= 0: + raise ValueError("sapo_temperature must be > 0.") + sigmoid_input = temperature * (importance_ratio - 1) + sigmoid_smoothed_loss = torch.sigmoid(sigmoid_input) + return sigmoid_smoothed_loss * 4 / temperature + + +class TorchLMHeadGRPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + beta: float = 0.1, + epsilon_low: float = 0.2, + epsilon_high: float = 0.2, + temperature: float = 1.0, + use_ref_model: bool = True, + loss_type: str = "bnpo", + max_completion_length: int | None = None, + importance_sampling_level: str = "token", + sapo_temperature_pos: float = 1.0, + sapo_temperature_neg: float = 1.05, + delta: float | None = None, + use_bias_correction_kl: bool = False, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.beta = beta + self.epsilon_low = epsilon_low + self.epsilon_high = epsilon_high + self.temperature = temperature + self.use_ref_model = use_ref_model + self.loss_type = loss_type + self.max_completion_length = max_completion_length + self.importance_sampling_level = importance_sampling_level + self.sapo_temperature_pos = sapo_temperature_pos + self.sapo_temperature_neg = sapo_temperature_neg + self.delta = delta + self.use_bias_correction_kl = use_bias_correction_kl + if self.loss_type == "dr_grpo": + assert self.max_completion_length is not None, "max_completion_length must be provided for dr_grpo" + + @staticmethod + def compute_per_token_components( + 
per_token_logps, + attention_mask, + advantages, + old_per_token_logps, + ref_per_token_logps, + epsilon_low, + epsilon_high, + beta, + importance_sampling_level, + loss_type: str = "grpo", + sapo_temperature_pos: float = 1.0, + sapo_temperature_neg: float = 1.05, + vllm_is_ratio=None, + delta=None, + use_bias_correction_kl=False, + ): + attention_mask = attention_mask.to(per_token_logps.dtype) + old_per_token_logps = ( + old_per_token_logps.float() if old_per_token_logps is not None else per_token_logps.detach() + ) + log_ratio = per_token_logps - old_per_token_logps + + if importance_sampling_level == "token": + log_importance_weights = log_ratio + elif importance_sampling_level == "sequence": + log_importance_weights = (log_ratio * attention_mask).sum(-1) / attention_mask.sum(-1).clamp(min=1.0) + log_importance_weights = log_importance_weights.unsqueeze(-1) + else: + raise ValueError( + f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' " + "and 'sequence'." + ) + + coef_1 = torch.exp(log_importance_weights) + expanded_advantages = advantages.unsqueeze(1) + + if loss_type == "sapo": + # SAPO: Soft Adaptive Policy Optimization + # Uses sigmoid-based soft gating instead of hard clipping + # Reference: https://github.com/huggingface/trl/blob/1bd2a52ec2d8344050af736d60cdc735181ae4b8/trl/trainer/grpo_trainer.py#L2037-L2046 + per_token_loss = torch.empty_like(coef_1) + advantages_expanded = expanded_advantages.expand_as(coef_1) + positive_advantages_mask = advantages_expanded > 0 + + per_token_loss[positive_advantages_mask] = sapo_loss_fn( + coef_1[positive_advantages_mask], sapo_temperature_pos + ) + per_token_loss[~positive_advantages_mask] = sapo_loss_fn( + coef_1[~positive_advantages_mask], sapo_temperature_neg + ) + per_token_loss = -per_token_loss * advantages_expanded + # SAPO doesn't use clipping metrics + is_lower_clipped = torch.zeros_like(coef_1, dtype=torch.bool) + is_upper_clipped = torch.zeros_like(coef_1, dtype=torch.bool) + elif loss_type == "cispo": + # CISPO: clip and detach the importance weights + upper_bound = epsilon_high + lower_bound = None + coef_2 = torch.clamp(coef_1, lower_bound, upper_bound).detach() + is_lower_clipped = torch.zeros_like(coef_1, dtype=torch.bool) + is_upper_clipped = coef_1 > upper_bound + # CISPO: clip and detach the importance weights, multiply by log probs + # Reference: https://github.com/huggingface/trl/blob/035c3ff151b953ca72cdfe0ee966bc1469a26fde/trl/trainer/grpo_trainer.py#L2030 + per_token_loss = -coef_2 * expanded_advantages * per_token_logps + else: + upper_bound = 1 + epsilon_high + lower_bound = 1 - epsilon_low + coef_2 = torch.clamp(coef_1, lower_bound, upper_bound) + is_lower_clipped = coef_1 < lower_bound + is_upper_clipped = coef_1 > upper_bound + if delta is not None: + coef_1 = torch.clamp(coef_1, max=delta) + per_token_loss1 = coef_1 * expanded_advantages + per_token_loss2 = coef_2 * expanded_advantages + per_token_loss = -torch.min(per_token_loss1, per_token_loss2) + + # Apply vLLM importance sampling correction BEFORE KL penalty + if vllm_is_ratio is not None: + per_token_loss = per_token_loss * vllm_is_ratio + + kl_div = None + if beta != 0.0: + ref_per_token_logps = ref_per_token_logps.float() + kl_div = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0 + if use_bias_correction_kl: + token_coef_1 = torch.exp(per_token_logps - old_per_token_logps) + kl_div = kl_div * token_coef_1 + per_token_loss = per_token_loss + beta * kl_div + + # 
Adjust clipping metric calculation based on importance sampling level + if importance_sampling_level == "token": + is_clipped = (is_lower_clipped & (expanded_advantages < 0)) | (is_upper_clipped & (expanded_advantages > 0)) + else: # sequence level + # For sequence level, coef_1 is shape (B, 1), advantages is shape (B,) + is_clipped = (is_lower_clipped & (expanded_advantages < 0)) | (is_upper_clipped & (expanded_advantages > 0)) + is_clipped = is_clipped.expand_as(attention_mask) + return per_token_loss, kl_div, is_clipped + + def forward( + self, + x, # Shape: [batch_size, seq_len, hidden_size] + selected_token_ids, # Shape: [batch_size, seq_len] + attention_mask, # Shape: [batch_size, seq_len] + advantages, # Shape: [batch_size,] + ref_per_token_logps=None, # Shape: [batch_size, seq_len] + old_per_token_logps=None, + ref_input=None, # Shape: [batch_size, seq_len, hidden_size] + vllm_is_ratio=None, # Shape: [batch_size, seq_len] or None + ): + logits = x @ self.lin.weight.t() + if self.lin.bias is not None: + logits = logits + self.lin.bias + if self.temperature != 1.0: + logits = logits / self.temperature + # Get log probabilities + log_probs = F.log_softmax(logits.float(), dim=-1) + + # Get chosen token probabilities + per_token_logps = log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(-1) + + # Get reference model probabilities, + if ref_per_token_logps is None: + if self.use_ref_model: + with torch.no_grad(): + ref_logits = ref_input @ self.ref_lin.weight.t() + if self.ref_lin.bias is not None: + ref_logits = ref_logits + self.ref_lin.bias.float() + if self.temperature != 1.0: + ref_logits = ref_logits / self.temperature + ref_log_probs = F.log_softmax(ref_logits.float(), dim=-1) + ref_per_token_logps = ref_log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze( + -1 + ) + else: + ref_per_token_logps = per_token_logps.detach() + + per_token_loss, kl_div, is_clipped = self.compute_per_token_components( + per_token_logps, + attention_mask, + advantages, + old_per_token_logps, + ref_per_token_logps, + self.epsilon_low, + self.epsilon_high, + self.beta, + self.importance_sampling_level, + self.loss_type, + self.sapo_temperature_pos, + self.sapo_temperature_neg, + vllm_is_ratio=vllm_is_ratio, + delta=self.delta, + use_bias_correction_kl=self.use_bias_correction_kl, + ) + + # Apply masking and calculate loss based on loss_type + if self.loss_type == "grpo" or self.loss_type == "sapo": + # SAPO uses same normalization as GRPO (per-sequence) + loss = ((per_token_loss * attention_mask).sum(-1) / torch.clamp(attention_mask.sum(-1), min=1.0)).mean() + elif self.loss_type == "bnpo": + loss = (per_token_loss * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0) + elif self.loss_type == "dr_grpo": + loss = (per_token_loss * attention_mask).sum() / (per_token_loss.size(0) * self.max_completion_length) + elif self.loss_type == "dapo": + normalizer = LigerFusedLinearPPOBase._compute_dapo_normalizer(attention_mask) + loss = (per_token_loss * attention_mask).sum() / normalizer + elif self.loss_type == "cispo": + normalizer = attention_mask.sum().clamp(min=1.0) + loss = (per_token_loss * attention_mask).sum() / normalizer + elif self.loss_type == "luspo": + loss = (per_token_loss * attention_mask.sum(-1, keepdim=True)).mean() + else: + raise ValueError(f"Unknown loss type: {self.loss_type}") + + # Compute metrics + metrics = [] + if self.beta != 0.0: + metrics.append(((kl_div * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0))) + 
metrics.append((is_clipped.float() * attention_mask).sum() / torch.clamp(attention_mask.sum(), min=1.0)) + return loss, metrics + + +class LigerLMHeadGRPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + beta: float = 0.1, + epsilon_low: float = 0.2, + epsilon_high: float = 0.2, + temperature: float = 1.0, + use_ref_model: bool = True, + loss_type: str = "bnpo", + max_completion_length: int | None = None, + importance_sampling_level: str = "token", + sapo_temperature_pos: float = 1.0, + sapo_temperature_neg: float = 1.05, + delta: float | None = None, + use_bias_correction_kl: bool = False, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.grpo_loss = LigerFusedLinearGRPOLoss( + beta=beta, + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + temperature=temperature, + use_ref_model=use_ref_model, + compiled=True, + loss_type=loss_type, + max_completion_length=max_completion_length, + importance_sampling_level=importance_sampling_level, + sapo_temperature_pos=sapo_temperature_pos, + sapo_temperature_neg=sapo_temperature_neg, + delta=delta, + use_bias_correction_kl=use_bias_correction_kl, + ) + + def forward( + self, + x, + selected_token_ids, + attention_mask, + advantages, + ref_per_token_logps=None, + old_per_token_logps=None, + ref_input=None, + vllm_is_ratio=None, + ): + return self.grpo_loss( + x, # _input + self.lin.weight, # weight + selected_token_ids, # selected_token_ids + attention_mask, # attention_mask + advantages, # advantages + self.lin.bias, # bias + ref_per_token_logps, # ref_per_token_logps + old_per_token_logps, # old_per_token_logps + ref_input, # ref_input + self.ref_lin.weight, # ref_weight + self.ref_lin.bias, # ref_bias + vllm_is_ratio=vllm_is_ratio, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize( + "beta, epsilon_low, epsilon_high, temperature", + [ + # Standard settings + (0.1, 0.2, 0.2, 1.0), + (0.0, 0.1, 0.1, 2.0), + ], +) +@pytest.mark.parametrize( + "use_ref_model, use_ref_per_token_logps, old_per_token_logps", + [ + (True, True, True), + (True, False, False), + (False, False, True), + ], +) +@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dr_grpo", "dapo", "cispo", "sapo", "luspo"]) +@pytest.mark.parametrize("importance_sampling_level", ["token", "sequence"]) +@pytest.mark.parametrize("delta", [None, 2.0]) +def test_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + beta, + epsilon_low, + epsilon_high, + temperature, + use_ref_per_token_logps, + use_ref_model, + old_per_token_logps, + loss_type, + importance_sampling_level, + delta, +): + if importance_sampling_level == "sequence" and loss_type in ("cispo", "sapo"): + pytest.skip(f"Sequence-level importance sampling is not supported for loss_type='{loss_type}'") + if delta is not None and loss_type in ("cispo", "sapo"): + pytest.skip(f"delta is not supported for loss_type='{loss_type}'") + + # LUSPO's formula multiplies per_token_loss by seq_lens, amplifying torch.compile + # numerical differences by O(T). Relax tolerances to account for this amplification. 
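+    # (See TorchLMHeadGRPO.forward above: for "luspo", each token's loss is multiplied by
+    # the sequence's unmasked length before the mean, which is where the O(T) factor enters.)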
+ if loss_type == "luspo": + if dtype == torch.bfloat16: + atol = max(atol, 1.0) + rtol = max(rtol, 5.0) + else: + atol = max(atol, 1e-4) + rtol = max(rtol, 5e-3) + + # Reset torch compiler cache for each parameter of the test case + torch.compiler.reset() + max_completion_length = T if loss_type == "dr_grpo" else None + + torch_lm_head_grpo = TorchLMHeadGRPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + beta=beta, + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + temperature=temperature, + use_ref_model=use_ref_model, + loss_type=loss_type, + max_completion_length=max_completion_length, + importance_sampling_level=importance_sampling_level, + delta=delta, + ) + liger_lm_head_grpo = LigerLMHeadGRPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + beta=beta, + epsilon_low=epsilon_low, + epsilon_high=epsilon_high, + temperature=temperature, + use_ref_model=use_ref_model, + loss_type=loss_type, + max_completion_length=max_completion_length, + importance_sampling_level=importance_sampling_level, + delta=delta, + ) + + # Initialize weights + torch_lm_head_grpo.lin.weight.data = liger_lm_head_grpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + if bias: + torch_lm_head_grpo.lin.bias.data = liger_lm_head_grpo.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + + # set ref weights to be close to the original weights + torch_lm_head_grpo.ref_lin.weight.data = liger_lm_head_grpo.ref_lin.weight.data = ( + torch_lm_head_grpo.lin.weight.data + torch.randn(V, H, device=device, dtype=dtype) * 0.01 + ) + if bias: + torch_lm_head_grpo.ref_lin.bias.data = liger_lm_head_grpo.ref_lin.bias.data = ( + torch_lm_head_grpo.lin.bias.data + torch.randn(V, device=device, dtype=dtype) * 0.01 + ) + + # Create inputs with shape [B, T, H] + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + # Create selected token ids with shape [B, T] + selected_token_ids = torch.randint(0, V, (B, T), device=device) + + # Compute per-token logps + with torch.no_grad(): + logits = _input @ torch_lm_head_grpo.lin.weight.t() + if torch_lm_head_grpo.lin.bias is not None: + logits = logits + torch_lm_head_grpo.lin.bias + logits = logits / temperature + logps = F.log_softmax(logits, dim=-1) + per_token_logps = logps.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(-1) + + # Create attention mask with random padding [B, T] + attention_mask = torch.ones(B, T, device=device) + num_elements_to_mask = torch.randint(1, B * T // 2, (1,)).item() + mask_indices = torch.randperm(B * T)[:num_elements_to_mask] + attention_mask.view(-1)[mask_indices] = 0 + + # Create advantages with shape [B] and ensure mixed signs for SAPO + advantages = torch.randn(B, device=device, dtype=dtype) + advantages[0] = -advantages[0].abs() + if B > 1: + advantages[1] = advantages[1].abs() + + ref_per_token_logps = None + ref_input = None + if use_ref_model and use_ref_per_token_logps: + # Create reference log probs with shape [B, T] + ref_per_token_logps = per_token_logps.detach() + torch.randn(B, T, device=device) * 0.01 + elif use_ref_model: + # Create reference inputs (optional) with shape [B, T, H] if ref_log_probs is None + ref_input = _input.detach() + torch.randn(B, T, H, device=device, dtype=dtype) * 0.01 + + if old_per_token_logps: + old_per_token_logps = per_token_logps.detach() + torch.randn(B, T, device=device) * 0.01 + else: + old_per_token_logps = None + + # Forward pass with 
reference model + loss1, aux1 = torch_lm_head_grpo( + input1, + selected_token_ids, + attention_mask, + advantages, + ref_per_token_logps=ref_per_token_logps, + old_per_token_logps=old_per_token_logps, + ref_input=ref_input, + ) + loss2, aux2 = liger_lm_head_grpo( + input2, + selected_token_ids, + attention_mask, + advantages, + ref_per_token_logps=ref_per_token_logps, + old_per_token_logps=old_per_token_logps, + ref_input=ref_input, + ) + # Check losses match + assert not torch.isnan(loss1) + assert not torch.isnan(loss2) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + # Check metrics match + assert len(aux1) == len(aux2) + # aggregated metrics are unstable for bfloat16 + for metric1, metric2 in zip(aux1, aux2): + assert_verbose_allclose(metric1, metric2, atol=atol, rtol=rtol) + + # Backward pass + loss1.backward() + loss2.backward() + + # Check gradients match for loss_type + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_grpo.lin.weight.grad, + liger_lm_head_grpo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_grpo.lin.bias.grad, + liger_lm_head_grpo.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize("loss_type", ["grpo", "dapo"]) +@pytest.mark.parametrize( + "dtype, atol, rtol", + [ + (torch.float32, 1e-5, 5e-4), + ], +) +def test_correctness_with_bias_correction_kl(loss_type, dtype, atol, rtol): + """Test use_bias_correction_kl (importance-sampling-corrected KL from DeepSeek-V3.2).""" + B, T, H, V = 3, 47, 31, 123 + beta = 0.1 # Must be non-zero for KL to matter + torch.compiler.reset() + + torch_lm_head_grpo = TorchLMHeadGRPO( + H=H, + V=V, + dtype=dtype, + beta=beta, + loss_type=loss_type, + use_bias_correction_kl=True, + ) + liger_lm_head_grpo = LigerLMHeadGRPO( + H=H, + V=V, + dtype=dtype, + beta=beta, + loss_type=loss_type, + use_bias_correction_kl=True, + ) + + torch_lm_head_grpo.lin.weight.data = liger_lm_head_grpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + torch_lm_head_grpo.ref_lin.weight.data = liger_lm_head_grpo.ref_lin.weight.data = ( + torch_lm_head_grpo.lin.weight.data + torch.randn(V, H, device=device, dtype=dtype) * 0.01 + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + selected_token_ids = torch.randint(0, V, (B, T), device=device) + attention_mask = torch.ones(B, T, device=device, dtype=dtype) + attention_mask[:, -10:] = 0 + advantages = torch.randn(B, device=device, dtype=torch.float32) + old_per_token_logps = torch.randn(B, T, device=device, dtype=torch.float32) + + loss1, metrics1 = torch_lm_head_grpo( + input1, + selected_token_ids, + attention_mask, + advantages, + old_per_token_logps=old_per_token_logps, + ref_input=input1.detach(), + ) + loss2, metrics2 = liger_lm_head_grpo( + input2, + selected_token_ids, + attention_mask, + advantages, + old_per_token_logps=old_per_token_logps, + ref_input=input2.detach(), + ) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + loss1.backward() + loss2.backward() + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_grpo.lin.weight.grad, + liger_lm_head_grpo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize("loss_type", ["bnpo", "grpo", "dapo", "cispo", "sapo", "luspo"]) +@pytest.mark.parametrize("beta", 
[0.0, 0.1]) +def test_correctness_with_vllm_is_ratio(loss_type, beta): + """Test vllm_is_ratio correctness against torch reference, and 1D/2D shape equivalence.""" + torch.compiler.reset() + B, T, H, V = 4, 32, 64, 128 + dtype = torch.float32 + atol, rtol = 1e-5, 5e-4 + + _weight = torch.randn(V, H, device=device, dtype=dtype) + _input = torch.randn(B, T, H, device=device, dtype=dtype) + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + selected_token_ids = torch.randint(0, V, (B, T), device=device) + attention_mask = torch.ones(B, T, device=device) + attention_mask[:, -5:] = 0 + advantages = torch.randn(B, device=device, dtype=dtype) + advantages[0] = -advantages[0].abs() # ensure mixed signs for SAPO + + vllm_is_ratio = torch.rand(B, T, device=device, dtype=torch.float32) * 0.999 + 0.001 + + torch_lm = TorchLMHeadGRPO(H=H, V=V, dtype=dtype, beta=beta, loss_type=loss_type, use_ref_model=False) + liger_lm = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, beta=beta, loss_type=loss_type, use_ref_model=False) + torch_lm.lin.weight.data = liger_lm.lin.weight.data = _weight.clone() + + loss1, aux1 = torch_lm(input1, selected_token_ids, attention_mask, advantages, vllm_is_ratio=vllm_is_ratio) + loss2, aux2 = liger_lm(input2, selected_token_ids, attention_mask, advantages, vllm_is_ratio=vllm_is_ratio) + + assert not torch.isnan(loss1) + assert not torch.isnan(loss2) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + for m1, m2 in zip(aux1, aux2): + assert_verbose_allclose(m1, m2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(torch_lm.lin.weight.grad, liger_lm.lin.weight.grad, atol=atol, rtol=rtol) + + # Verify 1D (B,) gives same result as (B, 1) + uniform_val = 0.42 + input3 = _input.detach().clone().requires_grad_(True) + input4 = _input.detach().clone().requires_grad_(True) + liger3 = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, beta=beta, loss_type=loss_type, use_ref_model=False) + liger4 = LigerLMHeadGRPO(H=H, V=V, dtype=dtype, beta=beta, loss_type=loss_type, use_ref_model=False) + liger3.lin.weight.data = liger4.lin.weight.data = _weight.clone() + + loss3, _ = liger3( + input3, + selected_token_ids, + attention_mask, + advantages, + vllm_is_ratio=torch.full((B,), uniform_val, device=device), + ) + loss4, _ = liger4( + input4, + selected_token_ids, + attention_mask, + advantages, + vllm_is_ratio=torch.full((B, 1), uniform_val, device=device), + ) + assert_verbose_allclose(loss3, loss4, atol=1e-5, rtol=1e-5) + loss3.backward() + loss4.backward() + assert_verbose_allclose(input3.grad, input4.grad, atol=1e-5, rtol=1e-5) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +def test_functional_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, +): + # Reset torch compiler cache for each parameter of the test case + torch.compiler.reset() + max_completion_length = T + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + _weight = torch.randn(V, H, device=device, dtype=dtype) * scalar + weight1 = 
_weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + selected_token_ids = torch.randint(0, V, (B, T), device=device) + + attention_mask = torch.ones(B, T, device=device) + + advantages = torch.rand(B, device=device, dtype=dtype) + + if bias: + _bias = torch.randn(V, device=device, dtype=dtype) * scalar + bias1 = _bias.detach().clone().requires_grad_(True) + bias2 = _bias.detach().clone().requires_grad_(True) + else: + bias1 = None + bias2 = None + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + + _ref_weight = _weight.detach() + torch.randn(V, H, device=device, dtype=dtype) * 0.01 + ref_weight1 = _ref_weight.detach().clone().requires_grad_(True) + ref_weight2 = _ref_weight.detach().clone().requires_grad_(True) + + if bias: + _ref_bias = _bias.detach() + torch.randn(V, device=device, dtype=dtype) * 0.01 + ref_bias1 = _ref_bias.detach().clone().requires_grad_(True) + ref_bias2 = _ref_bias.detach().clone().requires_grad_(True) + else: + ref_bias1 = None + ref_bias2 = None + + old_per_token_logps = None + ref_per_token_logps = None + + loss1, aux1 = liger_fused_linear_grpo( + input1, + weight1, + selected_token_ids, + attention_mask, + advantages, + bias1, + ref_per_token_logps, + old_per_token_logps, + ref_input, + ref_weight1, + ref_bias1, + 0.04, + 0.2, + 0.2, + "bnpo", + max_completion_length, + "token", + 1.0, + False, + True, + 1, + ) + + loss2, aux2 = LigerFusedLinearGRPOFunction.apply( + input2, + weight2, + selected_token_ids, + attention_mask, + advantages, + bias2, + ref_per_token_logps, + old_per_token_logps, + ref_input, + ref_weight2, + ref_bias2, + 0.04, + 0.2, + 0.2, + "bnpo", + max_completion_length, + "token", + 1.0, + False, + True, + 1, + ) + + assert not torch.isnan(loss1) + assert not torch.isnan(loss2) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + # Check metrics match + assert len(aux1) == len(aux2) + # aggregated metrics are unstable for bfloat16 + for metric1, metric2 in zip(aux1, aux2): + assert_verbose_allclose(metric1, metric2, atol=atol, rtol=rtol) + + +@pytest.mark.parametrize("loss_type", ["grpo", "bnpo", "dr_grpo", "dapo", "luspo"]) +def test_reduce_grpo_loss_matches_reference(loss_type): + torch.manual_seed(0) + per_token_loss = torch.randn(3, 5) + mask = torch.randint(0, 2, (3, 5), device=per_token_loss.device, dtype=torch.long) + mask[:, 0] = 1 # ensure at least one valid token per sequence + max_completion_length = 5 if loss_type == "dr_grpo" else None + + reduced = _reduce_grpo_loss(per_token_loss, mask, loss_type, max_completion_length) + + mask_f = mask.to(per_token_loss.dtype) + if loss_type == "grpo": + expected = ((per_token_loss * mask_f).sum(-1) / mask_f.sum(-1).clamp(min=1.0)).mean() + elif loss_type == "bnpo": + expected = (per_token_loss * mask_f).sum() / mask_f.sum().clamp(min=1.0) + elif loss_type == "dr_grpo": + expected = (per_token_loss * mask_f).sum() / (per_token_loss.size(0) * max_completion_length) + elif loss_type == "luspo": + expected = (per_token_loss * mask_f.sum(-1, keepdim=True)).mean() + else: # dapo/cispo + expected = (per_token_loss * mask_f).sum() / mask_f.sum().clamp(min=1.0) + + assert_verbose_allclose(reduced, expected) + + +def test_reduce_grpo_loss_requires_max_completion_length(): + per_token_loss = torch.randn(2, 3) + mask = torch.ones_like(per_token_loss, dtype=torch.long) + reduced = _reduce_grpo_loss(per_token_loss, mask, "dr_grpo", max_completion_length=None) + expected = (per_token_loss * mask).sum() / 
(per_token_loss.size(0) * per_token_loss.size(1)) + assert_verbose_allclose(reduced, expected) + + +@pytest.mark.parametrize("loss_type", ["cispo", "sapo"]) +def test_sequence_level_rejects_unsupported_loss_types(loss_type): + """Sequence-level importance sampling should raise ValueError for cispo and sapo.""" + B, T, H, V = 2, 8, 16, 32 + dtype = torch.float32 + + liger_lm = LigerLMHeadGRPO( + H=H, + V=V, + dtype=dtype, + beta=0.0, + loss_type=loss_type, + use_ref_model=False, + importance_sampling_level="sequence", + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype).requires_grad_(True) + selected_token_ids = torch.randint(0, V, (B, T), device=device) + attention_mask = torch.ones(B, T, device=device) + advantages = torch.randn(B, device=device) + + with pytest.raises(ValueError, match="Sequence-level importance sampling is not supported"): + liger_lm(_input, selected_token_ids, attention_mask, advantages) + + +@pytest.mark.parametrize("loss_type,beta", [("bnpo", 0.0), ("dapo", 0.04)]) +def test_triton_grpo_loss_matches_reference(loss_type, beta): + pytest.importorskip("triton") + device = infer_device() + + B, T, V = 2, 4, 16 + logits = torch.randn(B, T + 1, V, device=device, dtype=torch.float32).contiguous() + completion_ids = torch.randint(0, V, (B, T), device=device) + completion_mask = torch.randint(0, 2, (B, T), device=device, dtype=torch.long) + completion_mask[:, 0] = 1 # ensure each sequence has at least one valid token + advantages = torch.randn(B, device=device, dtype=torch.float32) + old_logp = torch.randn(B, T, device=device, dtype=torch.float32) + ref_logp = torch.randn(B, T, device=device, dtype=torch.float32) if beta != 0.0 else None + + per_token_loss, per_token_kl, is_clipped = triton_grpo_loss( + logits=logits, + old_logp=old_logp, + ref_logp=ref_logp, + completion_ids=completion_ids, + advantages=advantages, + completion_mask=completion_mask, + temperature=1.0, + beta=beta, + eps_low=0.2, + eps_high=0.2, + inplace=False, + loss_type=loss_type, + max_completion_length=T, + reduce=False, + ) + + logits_main = logits[:, :-1, :] + log_probs = torch.log_softmax(logits_main, dim=-1) + per_token_logps = log_probs.gather(dim=-1, index=completion_ids.unsqueeze(-1)).squeeze(-1) + ref_tokens = ref_logp if ref_logp is not None else per_token_logps.detach() + reference_loss, reference_kl, reference_is_clipped = TorchLMHeadGRPO.compute_per_token_components( + per_token_logps, + completion_mask.float(), + advantages, + old_logp, + ref_tokens, + 0.2, + 0.2, + beta, + "token", + ) + + mask = completion_mask.float() + mask_bool = mask.bool() + assert_verbose_allclose(per_token_loss, reference_loss * mask) + assert torch.equal(is_clipped.bool()[mask_bool], reference_is_clipped[mask_bool]) + if beta != 0.0: + assert_verbose_allclose(per_token_kl, reference_kl * mask) + else: + assert per_token_kl is None + + reduced_loss, metrics = triton_grpo_loss( + logits=logits, + old_logp=old_logp, + ref_logp=ref_logp, + completion_ids=completion_ids, + advantages=advantages, + completion_mask=completion_mask, + temperature=1.0, + beta=beta, + eps_low=0.2, + eps_high=0.2, + inplace=False, + loss_type=loss_type, + max_completion_length=T, + reduce=True, + ) + expected_loss = _reduce_grpo_loss(reference_loss, completion_mask, loss_type, T) + assert_verbose_allclose(reduced_loss, expected_loss) + if beta != 0.0: + assert_verbose_allclose(metrics[0], _masked_mean(reference_kl, completion_mask)) + clip_metric = metrics[1] + else: + clip_metric = metrics[0] + 
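As a reading aid for the reductions asserted above: the loss_type variants that `_reduce_grpo_loss` is checked against differ only in their normalizer. A minimal standalone sketch (plain PyTorch, independent of the liger_kernel helpers; the function name here is illustrative) mirroring the reference formulas from `test_reduce_grpo_loss_matches_reference`:

import torch


def reduce_grpo_loss_sketch(per_token_loss, mask, loss_type, max_completion_length=None):
    # mask marks valid completion tokens (1.0) vs. padding (0.0)
    mask = mask.to(per_token_loss.dtype)
    if loss_type == "grpo":
        # per-sequence token mean, then mean over the batch
        return ((per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()
    if loss_type in ("bnpo", "dapo"):
        # one global mean over every valid token in the batch
        return (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0)
    if loss_type == "dr_grpo":
        # fixed normalizer B * max_completion_length, independent of actual lengths
        return (per_token_loss * mask).sum() / (per_token_loss.size(0) * max_completion_length)
    raise ValueError(f"unhandled loss_type: {loss_type}")


per_token_loss = torch.tensor([[1.0, 2.0, 3.0, 4.0], [2.0, 2.0, 2.0, 2.0]])
mask = torch.tensor([[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
print(reduce_grpo_loss_sketch(per_token_loss, mask, "grpo"))  # (1.5 + 2.0) / 2 = 1.75
print(reduce_grpo_loss_sketch(per_token_loss, mask, "bnpo"))  # 11 / 6 = 1.8333...
print(reduce_grpo_loss_sketch(per_token_loss, mask, "dr_grpo", max_completion_length=4))  # 11 / (2 * 4) = 1.375

The variants agree when every sequence uses all max_completion_length tokens and disagree otherwise, which is why the tests above randomize the completion mask rather than using all-ones.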
assert_verbose_allclose(clip_metric, _masked_mean(reference_is_clipped.float(), completion_mask)) + + +def _reference_per_token_loss( + logits, + completion_ids, + completion_mask, + advantages, + old_logp, + ref_logp, + beta, + eps_low, + eps_high, + temperature=1.0, + delta=None, + use_bias_correction_kl=False, +): + logits = logits[:, :-1, :] / temperature + log_probs = torch.log_softmax(logits, dim=-1) + per_token_logps = log_probs.gather(-1, completion_ids.unsqueeze(-1)).squeeze(-1) + old = old_logp if old_logp is not None else per_token_logps.detach() + coef_1 = torch.exp(per_token_logps - old) + coef_2 = torch.clamp(coef_1, 1 - eps_low, 1 + eps_high) + if delta is not None: + coef_1 = torch.clamp(coef_1, max=delta) + per_token_loss1 = coef_1 * advantages.unsqueeze(1) + per_token_loss2 = coef_2 * advantages.unsqueeze(1) + per_token_loss = -torch.minimum(per_token_loss1, per_token_loss2) + is_clipped = per_token_loss1 < per_token_loss2 + mask = completion_mask.to(torch.bool) + per_token_loss = per_token_loss.masked_fill(~mask, 0.0) + is_clipped = is_clipped & mask + if beta != 0.0: + kl = torch.exp(ref_logp - per_token_logps) - (ref_logp - per_token_logps) - 1.0 + if use_bias_correction_kl: + kl = kl * torch.exp(per_token_logps - old) + kl = kl.masked_fill(~mask, 0.0) + per_token_loss = per_token_loss + beta * kl + else: + kl = None + return { + "per_token_loss": per_token_loss, + "kl": kl, + "is_clipped": is_clipped, + } + + +def _masked_mean(values, mask): + mask = mask.to(values.dtype) + return (values * mask).sum() / mask.sum().clamp(min=1.0) diff --git a/test/chunked_loss/test_jsd_loss.py b/test/chunked_loss/test_jsd_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..415a015c921d8027fe54e214a06e24725968a970 --- /dev/null +++ b/test/chunked_loss/test_jsd_loss.py @@ -0,0 +1,441 @@ +import math + +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_jsd +from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction +from liger_kernel.utils import infer_device +from test.utils import HFDistillationLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +class HFJSDLoss(HFDistillationLoss): + """ + Naive implementation of a distillation loss using Jensen-Shannon Divergence (JSD). + """ + + def __init__( + self, + temperature: float = 1.0, + ignore_index: int = -100, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + ): + super().__init__( + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + ) + + def distillation_loss(self, student_logits, teacher_logits, target=None, ignore_index=-100, beta=0.5): + """ + Compute JSD loss (Jensen-Shannon Divergence Loss). + Args: + student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len, vocab_size). + teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size). + target (torch.Tensor): Target labels for masking. Shape: (batch_size * seq_len,). + ignore_index (int): Index to ignore in loss computation. + beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`. 
+ Returns: + torch.Tensor: Jensen-Shannon Divergence loss + """ + student_log_probs = F.log_softmax(student_logits, dim=-1) + teacher_log_probs = F.log_softmax(teacher_logits, dim=-1) + + if beta == 0: + jsd_loss = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True) + elif beta == 1: + jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True) + else: + log_mean_probs = torch.logsumexp( + torch.stack([student_log_probs + math.log(1 - beta), teacher_log_probs + math.log(beta)], dim=0), dim=0 + ) + student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="none", log_target=True) + teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="none", log_target=True) + jsd_loss = beta * teacher_kl + (1 - beta) * student_kl + + # Sum over vocab dimension + jsd_loss = jsd_loss.sum(dim=-1) + + # Apply ignore_index mask + if target is not None: + mask = target != ignore_index + jsd_loss = jsd_loss * mask.float() + num_valid_tokens = mask.sum().clamp_min(1) + return jsd_loss.sum() / num_valid_tokens + + return jsd_loss.sum() + + +class TorchLMHeadJSD(torch.nn.Module): + """Ground truth implementation of the linear fused with torch based jsd loss. + :param H: hidden size + :param V: vocab size + :param temperature: softmax temperature + :param weight_hard_loss: weight_hard_loss + :param weight_soft_loss: weight_soft_loss + """ + + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool, + device: torch.device, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__() + # smaller student model weights + self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype, device=device) + self.beta = beta + self.jsd = HFJSDLoss( + ignore_index=ignore_index, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + temperature=temperature, + ).get_batch_loss_metrics + + def forward(self, student_input, teacher_input, target): + jsd_loss = self.jsd( + student_input, + self.student_lin.weight, + teacher_input, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + beta=self.beta, + ) + return jsd_loss + + def backward_with_grad_and_value(self, student_input, teacher_input, target): + """ + Compute gradients using grad_and_value on NPU to match Liger implementation. + This method is used in tests on NPU devices to ensure consistency. 
+ """ + # Use grad_and_value to compute gradients and loss + if self.student_lin.bias is not None: + + def loss_fn(student_input, student_weight, student_bias): + return self.jsd( + student_input, + student_weight, + teacher_input, + self.teacher_lin.weight, + target, + student_bias, + self.teacher_lin.bias, + beta=self.beta, + ) + + (grad_input, grad_weight, grad_bias), loss = torch.func.grad_and_value(loss_fn, argnums=(0, 1, 2))( + student_input, self.student_lin.weight, self.student_lin.bias + ) + + # Set gradients + student_input.grad = grad_input + self.student_lin.weight.grad = grad_weight + self.student_lin.bias.grad = grad_bias + else: + + def loss_fn(student_input, student_weight): + return self.jsd( + student_input, + student_weight, + teacher_input, + self.teacher_lin.weight, + target, + None, # student_bias is None when bias=False + self.teacher_lin.bias, + beta=self.beta, + ) + + (grad_input, grad_weight), loss = torch.func.grad_and_value(loss_fn, argnums=(0, 1))( + student_input, self.student_lin.weight + ) + + # Set gradients + student_input.grad = grad_input + self.student_lin.weight.grad = grad_weight + + return loss + + +class LigerLMHeadJSD(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool, + device: torch.device, + weight_hard_loss: float = 0.5, + weight_soft_loss: float = 0.5, + beta: float = 0.5, + ignore_index: int = -100, + temperature: float = 1.0, + ): + super().__init__() + # smaller student model weights + self.student_lin = torch.nn.Linear(in_features=H // 2, out_features=V, bias=bias, dtype=dtype, device=device) + self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype, device=device) + self.chunked_jsd = LigerFusedLinearJSDLoss( + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + ignore_index=ignore_index, + temperature=temperature, + beta=beta, + ) + + def forward(self, student_input, teacher_input, target): + return self.chunked_jsd( + student_input, + self.student_lin.weight, + teacher_input, + self.teacher_lin.weight, + target, + self.student_lin.bias, + self.teacher_lin.bias, + ) + + +############################################################################# +# Test the correctness of the fused linear JSD +############################################################################# + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize( + "temperature, weight_hard_loss, weight_soft_loss, beta", + [ + (1.0, 0.5, 0.5, 0.5), + (2.0, 0.0, 1.0, 0.8), + (0.5, 1.0, 0.0, 0.2), + ], +) +@pytest.mark.parametrize("ignore_index", [-100, 42]) +def test_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + temperature, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, +): + torch_lm_head_jsd = TorchLMHeadJSD( + H=H, + V=V, + dtype=dtype, + bias=bias, + device=device, + temperature=temperature, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + beta=beta, + ignore_index=ignore_index, + ) + liger_lm_head_jsd = LigerLMHeadJSD( + H=H, + V=V, + dtype=dtype, + bias=bias, + device=device, + temperature=temperature, + weight_hard_loss=weight_hard_loss, + weight_soft_loss=weight_soft_loss, + beta=beta, + ignore_index=ignore_index, + 
) + + torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand( + V, H // 2, device=device, dtype=dtype + ) + torch_lm_head_jsd.teacher_lin.weight.data = liger_lm_head_jsd.teacher_lin.weight.data = torch.rand( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_jsd.student_lin.bias.data = liger_lm_head_jsd.student_lin.bias.data = torch.rand( + V, device=device, dtype=dtype + ) + torch_lm_head_jsd.teacher_lin.bias.data = liger_lm_head_jsd.teacher_lin.bias.data = torch.rand( + V, device=device, dtype=dtype + ) + + _tensor = torch.rand(B * T, H // 2, device=device, dtype=dtype) * scalar + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = _tensor.detach().clone().requires_grad_(True) + + teacher_input = torch.rand(B * T, H, device=device, dtype=dtype) * scalar + + target = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target[indices_to_assign] = ignore_index + + # On NPU, use grad_and_value for the reference implementation to match the Liger implementation + if device == "npu": + loss1 = torch_lm_head_jsd.backward_with_grad_and_value(student_input1, teacher_input, target) + loss2 = liger_lm_head_jsd(student_input2, teacher_input, target) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + loss2.backward() + else: + loss1 = torch_lm_head_jsd(student_input1, teacher_input, target) + loss2 = liger_lm_head_jsd(student_input2, teacher_input, target) + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + loss1.backward() + loss2.backward() + + assert_verbose_allclose(student_input1.grad, student_input2.grad, atol=atol, rtol=rtol) + + assert_verbose_allclose( + torch_lm_head_jsd.student_lin.weight.grad, + liger_lm_head_jsd.student_lin.weight.grad, + atol=atol, + rtol=rtol, + ) + + if bias: + assert_verbose_allclose( + torch_lm_head_jsd.student_lin.bias.grad, + liger_lm_head_jsd.student_lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (9, 7, 41, 41), + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-2), + (1.0, torch.float32, 1e-4, 5e-3), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize( + "temperature, weight_hard_loss, weight_soft_loss, beta, ignore_index", + [(1.0, 0.5, 0.5, 0.5, -100), (2.0, 0.1, 0.9, 0.5, 42)], +) +def test_correctness_functional( + B, + T, + H, + V, + scalar, + dtype, + bias, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, + temperature, + atol, + rtol, +): + _weight = torch.rand(V, H // 2, device=device, dtype=dtype) + student_weight1 = _weight.detach().clone().requires_grad_(True) + student_weight2 = _weight.detach().clone().requires_grad_(True) + teacher_weight = torch.rand(V, H, device=device, dtype=dtype) + + if bias: + _bias = torch.rand(V, device=device, dtype=dtype) + student_bias1 = _bias.detach().clone().requires_grad_(True) + student_bias2 = _bias.detach().clone().requires_grad_(True) + teacher_bias = torch.rand(V, device=device, dtype=dtype) + else: + student_bias1 = student_bias2 = teacher_bias = None + + _tensor = torch.rand(B * T, H // 2, device=device, dtype=dtype) * scalar + student_input1 = _tensor.detach().clone().requires_grad_(True) + student_input2 = 
_tensor.detach().clone().requires_grad_(True) + teacher_input = torch.rand(B * T, H, device=device, dtype=dtype) * scalar + + label = torch.randint(0, V, (B * T,), device=device, dtype=torch.long) + + output1 = liger_fused_linear_jsd( + student_input1, + student_weight1, + teacher_input, + teacher_weight, + label, + student_bias1, + teacher_bias, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, + temperature, + ) + output2 = LigerFusedLinearJSDFunction.apply( + student_input2, + student_weight2, + teacher_input, + teacher_weight, + label, + student_bias2, + teacher_bias, + weight_hard_loss, + weight_soft_loss, + beta, + ignore_index, + temperature, + ) + + assert_verbose_allclose(output1, output2, atol=atol, rtol=rtol) + + output1.backward() + output2.backward() + + assert_verbose_allclose(student_input1.grad, student_input2.grad, atol=atol, rtol=rtol) + + assert_verbose_allclose(student_weight1.grad, student_weight2.grad, atol=atol, rtol=rtol) + + if bias: + assert_verbose_allclose(student_bias1.grad, student_bias2.grad, atol=atol, rtol=rtol) diff --git a/test/chunked_loss/test_kto_loss.py b/test/chunked_loss/test_kto_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..593159aa60d7b93f1093c9f94beca49496886fc8 --- /dev/null +++ b/test/chunked_loss/test_kto_loss.py @@ -0,0 +1,434 @@ +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_kto +from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction +from liger_kernel.utils import infer_device +from test.utils import HFAlignmentLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed(0) + + +class HFKTOLoss(HFAlignmentLoss): + """ + Implementation of the Kahneman-Tversky Optimization (KTO) loss, + adapted from Hugging Face's implementation. + Reference: https://github.com/huggingface/trl/blob/main/trl/trainer/kto_trainer.py + """ + + def __init__( + self, + ignore_index: int = -100, + beta: float = 0.1, + use_ref_model: bool = True, + ): + super().__init__( + beta=beta, + ignore_index=ignore_index, + use_ref_model=use_ref_model, + unpaired=True, + compute_nll_loss=False, + ) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ref_chosen_logps: torch.FloatTensor, + ref_rejected_logps: torch.FloatTensor, + kl: torch.FloatTensor = None, + ): + """Compute KTO loss for a batch of policy log probabilities. + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + ref_chosen_logps: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,) + ref_rejected_logps: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,) + kl: Precomputed KL divergence between the policy and reference distributions, used as the reference point of the KTO loss. + Returns: + A tuple (losses, sum_chosen_rewards, sum_rejected_rewards): the losses tensor contains the KTO loss for each example in the batch, and the reward sums are aggregated over the batch for logging purposes. 
+ """ + if kl is None: + kl = torch.zeros(1).to(policy_chosen_logps.device) + + # Chosen losses + chosen_logratios = policy_chosen_logps - ref_chosen_logps + if policy_chosen_logps.shape[0] != 0 or ref_chosen_logps.shape[0] != 0: + # Eqn (7) of the KTO paper (https://huggingface.co/papers/2402.01306) + chosen_losses = 1 - F.sigmoid(self.beta * (chosen_logratios - kl)) + + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + chosen_losses = torch.Tensor([]).to(policy_chosen_logps.device) + + # Rejected losses + rejected_logratios = policy_rejected_logps - ref_rejected_logps + if policy_rejected_logps.shape[0] != 0 or ref_rejected_logps.shape[0] != 0: + rejected_losses = 1 - F.sigmoid(self.beta * (kl - rejected_logratios)) + else: + # lists can't be empty -- if they are, then accelerate.gather will hang + rejected_losses = torch.Tensor([]).to(policy_rejected_logps.device) + + losses = torch.cat( + (chosen_losses, rejected_losses), + 0, + ) + + chosen_rewards = self.beta * chosen_logratios + rejected_rewards = self.beta * rejected_logratios + + return losses, chosen_rewards.sum(), rejected_rewards.sum() + + +class TorchLMHeadKTO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.KTO_loss = HFKTOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + ).get_batch_loss_metrics + + def forward(self, x, ref_x, y, preference_labels, kl=None): + return self.KTO_loss( + weight=self.lin.weight, + _input=x, + target=y, + bias=self.lin.bias, + ref_input=ref_x, + ref_weight=self.ref_lin.weight, + ref_bias=self.ref_lin.bias, + preference_labels=preference_labels, + kl=kl, + average_log_prob=True, + ) + + +class LigerLMHeadKTO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ref_bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=ref_bias, dtype=dtype) + self.KTO_loss = LigerFusedLinearKTOLoss( + ignore_index=ignore_index, + beta=beta, + use_ref_model=True, + average_log_prob=True, + ) + + def forward(self, x, ref_x, y, preference_labels, kl=None): + return self.KTO_loss( + _input=x, + lin_weight=self.lin.weight, + target=y, + preference_labels=preference_labels, + bias=self.lin.bias, + ref_input=ref_x, + ref_weight=self.ref_lin.weight, + ref_bias=self.ref_lin.bias, + kl=kl, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ref_bias", [True, False]) +@pytest.mark.parametrize("ignore_index, beta", [(-100, 0.1), (42, 0.2)]) +def test_correctness(B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias, ignore_index, beta): + # Preference labels shape: [B] + # Create binary preference labels (0 or 1) for each sequence in the batch + # Used to indicate preferred sequences (1) vs non-preferred 
sequences (0) + preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device, requires_grad=False) + num_chosen_samples = preference_labels.sum() + num_rejected_samples = len(preference_labels) - num_chosen_samples + + # Precomputed KL divergence between policy and reference distributions + kl = torch.randn(1, device=device, dtype=dtype) + + torch_lm_head_KTO = TorchLMHeadKTO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + ignore_index=ignore_index, + beta=beta, + ) + liger_lm_head_KTO = LigerLMHeadKTO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ref_bias=ref_bias, + ignore_index=ignore_index, + beta=beta, + ) + + torch_lm_head_KTO.lin.weight.data = liger_lm_head_KTO.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + torch_lm_head_KTO.ref_lin.weight.data = liger_lm_head_KTO.ref_lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_KTO.lin.bias.data = liger_lm_head_KTO.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + if ref_bias: + torch_lm_head_KTO.ref_lin.bias.data = liger_lm_head_KTO.ref_lin.bias.data = torch.randn( + V, device=device, dtype=dtype + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_KTO( + x=input1, ref_x=ref_input, y=target, preference_labels=preference_labels, kl=kl + ) + loss2, aggregated_aux_outputs2 = liger_lm_head_KTO( + x=input2, ref_x=ref_input, y=target, preference_labels=preference_labels, kl=kl + ) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + # Metrics tests are flaky for bf16 due to precision issues + if dtype == torch.float32: + # chosen_logps + chosen_logps_mean1 = aggregated_aux_outputs1[0] / ((num_chosen_samples) + 1e-20) + chosen_logps_mean2 = aggregated_aux_outputs2[0] / ((num_chosen_samples) + 1e-20) + assert_verbose_allclose(chosen_logps_mean1, chosen_logps_mean2, atol=atol, rtol=rtol) + + # chosen_logits + chosen_logits_mean1 = aggregated_aux_outputs1[2] / ((num_chosen_samples * T * V) + 1e-20) + chosen_logits_mean2 = aggregated_aux_outputs2[2] / ((num_chosen_samples * T * V) + 1e-20) + assert_verbose_allclose(chosen_logits_mean1, chosen_logits_mean2, atol=atol, rtol=rtol) + + # chosen_rewards + chosen_rewards_mean1 = aggregated_aux_outputs1[4] / ((num_chosen_samples) + 1e-20) + chosen_rewards_mean2 = aggregated_aux_outputs2[4] / ((num_chosen_samples) + 1e-20) + assert_verbose_allclose(chosen_rewards_mean1, chosen_rewards_mean2, atol=atol, rtol=rtol) + + # rejected_logps + rejected_logps_mean1 = aggregated_aux_outputs1[1] / ((num_rejected_samples) + 1e-20) + rejected_logps_mean2 = aggregated_aux_outputs2[1] / ((num_rejected_samples) + 1e-20) + assert_verbose_allclose(rejected_logps_mean1, rejected_logps_mean2, atol=atol, rtol=rtol) + + # rejected_logits + rejected_logits_mean1 = aggregated_aux_outputs1[3] / ((num_rejected_samples * T * 
V) + 1e-20) + rejected_logits_mean2 = aggregated_aux_outputs2[3] / ((num_rejected_samples * T * V) + 1e-20) + assert_verbose_allclose(rejected_logits_mean1, rejected_logits_mean2, atol=atol, rtol=rtol) + + # rejected_rewards + rejected_rewards_mean1 = aggregated_aux_outputs1[5] / ((num_rejected_samples) + 1e-20) + rejected_rewards_mean2 = aggregated_aux_outputs2[5] / ((num_rejected_samples) + 1e-20) + assert_verbose_allclose(rejected_rewards_mean1, rejected_rewards_mean2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1, input2, atol=atol, rtol=rtol) + assert_verbose_allclose(torch_lm_head_KTO.lin.weight, liger_lm_head_KTO.lin.weight, atol=atol, rtol=rtol) + + if bias: + assert_verbose_allclose(torch_lm_head_KTO.lin.bias, liger_lm_head_KTO.lin.bias, atol=atol, rtol=rtol) + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_KTO.lin.weight.grad, + liger_lm_head_KTO.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_KTO.lin.bias.grad, + liger_lm_head_KTO.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ref_bias", [True, False]) +def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias, ref_bias): + # Preference labels shape: [B] + # Create binary preference labels (0 or 1) for each sequence in the batch + # Used to indicate preferred sequences (1) vs non-preferred sequences (0) + preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device) + num_chosen_samples = preference_labels.sum() + num_rejected_samples = len(preference_labels) - num_chosen_samples + + # Precomputed KL divergence between policy and reference distributions + kl = torch.randn(1, device=device, dtype=dtype) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + ref_input = torch.randn(B, T, H, device=device, dtype=dtype, requires_grad=False) * scalar + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + + _weight = torch.randn(V, H, device=device, dtype=dtype) + weight1 = _weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + _ref_weight = torch.randn(V, H, device=device, dtype=dtype) + ref_weight1 = _ref_weight.detach().clone().requires_grad_(True) + ref_weight2 = _ref_weight.detach().clone().requires_grad_(True) + + _bias = torch.randn(V, device=device, dtype=dtype) if bias else None + bias1 = _bias.detach().clone().requires_grad_(True) if bias else None + bias2 = _bias.detach().clone().requires_grad_(True) if bias else None + + _ref_bias = torch.randn(V, device=device, dtype=dtype) if ref_bias else None + ref_bias1 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None + ref_bias2 = _ref_bias.detach().clone().requires_grad_(True) if ref_bias else None + + loss1, aggregated_aux_outputs1 = LigerFusedLinearKTOFunction.apply( + input1, + weight1, + target, + preference_labels, + bias1, + ref_input, + ref_weight1, + ref_bias1, + kl, + ) + loss2, aggregated_aux_outputs2 
= liger_fused_linear_kto( + input2, + weight2, + target, + preference_labels, + bias2, + ref_input, + ref_weight2, + ref_bias2, + kl, + ) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + # Metrics tests are flaky for bf16 due to precision issues + if dtype == torch.float32: + # chosen_logps + chosen_logps_mean1 = aggregated_aux_outputs1[0] / ((num_chosen_samples) + 1e-20) + chosen_logps_mean2 = aggregated_aux_outputs2[0] / ((num_chosen_samples) + 1e-20) + assert_verbose_allclose(chosen_logps_mean1, chosen_logps_mean2, atol=atol, rtol=rtol) + + # chosen_logits + chosen_logits_mean1 = aggregated_aux_outputs1[2] / ((num_chosen_samples * T * V) + 1e-20) + chosen_logits_mean2 = aggregated_aux_outputs2[2] / ((num_chosen_samples * T * V) + 1e-20) + assert_verbose_allclose(chosen_logits_mean1, chosen_logits_mean2, atol=atol, rtol=rtol) + + # chosen_rewards + chosen_rewards_mean1 = aggregated_aux_outputs1[4] / ((num_chosen_samples) + 1e-20) + chosen_rewards_mean2 = aggregated_aux_outputs2[4] / ((num_chosen_samples) + 1e-20) + assert_verbose_allclose(chosen_rewards_mean1, chosen_rewards_mean2, atol=atol, rtol=rtol) + + # rejected_logps + rejected_logps_mean1 = aggregated_aux_outputs1[1] / ((num_rejected_samples) + 1e-20) + rejected_logps_mean2 = aggregated_aux_outputs2[1] / ((num_rejected_samples) + 1e-20) + assert_verbose_allclose(rejected_logps_mean1, rejected_logps_mean2, atol=atol, rtol=rtol) + + # rejected_logits + rejected_logits_mean1 = aggregated_aux_outputs1[3] / ((num_rejected_samples * T * V) + 1e-20) + rejected_logits_mean2 = aggregated_aux_outputs2[3] / ((num_rejected_samples * T * V) + 1e-20) + assert_verbose_allclose(rejected_logits_mean1, rejected_logits_mean2, atol=atol, rtol=rtol) + + # rejected_rewards + rejected_rewards_mean1 = aggregated_aux_outputs1[5] / ((num_rejected_samples) + 1e-20) + rejected_rewards_mean2 = aggregated_aux_outputs2[5] / ((num_rejected_samples) + 1e-20) + assert_verbose_allclose(rejected_rewards_mean1, rejected_rewards_mean2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol) + if bias: + assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol) diff --git a/test/chunked_loss/test_orpo_loss.py b/test/chunked_loss/test_orpo_loss.py new file mode 100755 index 0000000000000000000000000000000000000000..a5c43b1b0b7e0dd33ae77a83fd0b3586e9c81d7c --- /dev/null +++ b/test/chunked_loss/test_orpo_loss.py @@ -0,0 +1,266 @@ +from typing import Tuple + +import pytest +import torch +import torch.nn.functional as F + +from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_orpo +from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction +from liger_kernel.utils import infer_device +from test.utils import HFAlignmentLoss +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +class HFORPOLoss(HFAlignmentLoss): + """ + Implementation of the Odds Ratio Preference Optimization (ORPO) loss, + adapted from Hugging Face's implementation. 
+ Reference: https://github.com/huggingface/trl/blob/main/trl/trainer/orpo_trainer.py + """ + + def __init__(self, ignore_index: int = -100, beta: float = 0.1): + super().__init__(beta=beta, ignore_index=ignore_index) + + def alignment_loss( + self, + policy_chosen_logps: torch.FloatTensor, + policy_rejected_logps: torch.FloatTensor, + ) -> Tuple[ + torch.FloatTensor, + torch.FloatTensor, + torch.FloatTensor, + torch.FloatTensor, + torch.FloatTensor, + ]: + """Compute ORPO's odds ratio (OR) loss for a batch of policy model log probabilities. + + Args: + policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) + policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) + + Returns: + A tuple of five tensors: (losses, chosen_rewards, rejected_rewards, mean_ratio, mean_log_odds). + The losses tensor contains the ORPO loss for each example in the batch. + The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively. + mean_ratio is the batch mean of `log(sigmoid(log_odds))`, for logging purposes. + mean_log_odds is the batch mean of the log odds ratio of the chosen responses over the rejected responses, for logging purposes. + """ + + # Derived from Eqs. (4) and (7) from https://huggingface.co/papers/2403.07691 by using log identities and exp(log(P(y|x))) = P(y|x) + log_odds = (policy_chosen_logps - policy_rejected_logps) - ( + torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps)) + ) + ratio = F.logsigmoid(log_odds) + losses = -self.beta * ratio + + chosen_rewards = self.beta * policy_chosen_logps + rejected_rewards = self.beta * policy_rejected_logps + + return ( + losses, + chosen_rewards, + rejected_rewards, + torch.mean(ratio), + torch.mean(log_odds), + ) + + +class TorchLMHeadORPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.orpo_loss = HFORPOLoss(ignore_index=ignore_index, beta=beta).get_batch_loss_metrics + + def forward(self, x, y, nll_target=None): + return self.orpo_loss(self.lin.weight, x, y, self.lin.bias, nll_target=nll_target) + + +class LigerLMHeadORPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.orpo_loss = LigerFusedLinearORPOLoss(ignore_index=ignore_index, beta=beta) + + def forward(self, x, y, nll_target=None): + return self.orpo_loss(self.lin.weight, x, y, self.lin.bias, nll_target=nll_target) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ignore_index, beta", [(-100, 0.1), (42, 0.2)]) +def test_correctness(B, T, H, V, scalar, dtype, atol, rtol, bias, ignore_index, beta): + # reset torch compiler cache + torch.compiler.reset() + + B = 2 * B # ORPO loss requires B to be even + torch_lm_head_orpo = TorchLMHeadORPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + ) + 
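As a reading aid for the `alignment_loss` math above: the log-odds term is log[odds(P_chosen) / odds(P_rejected)] with odds(p) = p / (1 - p), evaluated entirely in log space (log(1 - P) becomes log1p(-exp(log P))) for numerical stability. A tiny standalone sketch with made-up per-sequence log probabilities (illustrative values only, not taken from the test):

import torch
import torch.nn.functional as F

beta = 0.1
# made-up per-sequence log probabilities; in the tests these come from the LM head
policy_chosen_logps = torch.tensor([-0.3])
policy_rejected_logps = torch.tensor([-1.2])

# log[odds(P_chosen) / odds(P_rejected)], where odds(p) = p / (1 - p)
log_odds = (policy_chosen_logps - policy_rejected_logps) - (
    torch.log1p(-torch.exp(policy_chosen_logps)) - torch.log1p(-torch.exp(policy_rejected_logps))
)
losses = -beta * F.logsigmoid(log_odds)  # same expression as HFORPOLoss.alignment_loss
print(log_odds, losses)  # tensor([1.8918]) tensor([0.0140])

Widening the gap between the chosen and rejected log probabilities drives log_odds up and the loss toward zero; beta only scales the penalty.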
liger_lm_head_orpo = LigerLMHeadORPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + ) + + torch_lm_head_orpo.lin.weight.data = liger_lm_head_orpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_orpo.lin.bias.data = liger_lm_head_orpo.lin.bias.data = torch.randn(V, device=device, dtype=dtype) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + nll_target = torch.randint(0, V, (B, T), device=device, dtype=torch.long) + + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_orpo(input1, target, nll_target) + loss2, aggregated_aux_outputs2 = liger_lm_head_orpo(input2, target, nll_target) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + for i in range(len(aggregated_aux_outputs1)): + assert_verbose_allclose( + aggregated_aux_outputs1[i], + aggregated_aux_outputs2[i], + atol=atol, + rtol=rtol, + ) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_orpo.lin.weight.grad, + liger_lm_head_orpo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_orpo.lin.bias.grad, + liger_lm_head_orpo.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias): + # reset torch compiler cache + torch.compiler.reset() + + B = 2 * B + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + + _weight = torch.randn(V, H, device=device, dtype=dtype) + weight1 = _weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + _bias = torch.randn(V, device=device, dtype=dtype) if bias else None + bias1 = _bias.detach().clone().requires_grad_(True) if bias else None + bias2 = _bias.detach().clone().requires_grad_(True) if bias else None + + loss1, _ = LigerFusedLinearORPOFunction.apply(input1, weight1, target, bias1) + loss2, _ = liger_fused_linear_orpo(input2, weight2, target, bias2) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol) + if bias: + assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol) diff --git a/test/chunked_loss/test_simpo_loss.py b/test/chunked_loss/test_simpo_loss.py 
new file mode 100755 index 0000000000000000000000000000000000000000..4a6f019598a9dcfd0d78d060d2e3a8196aaa0a9e --- /dev/null +++ b/test/chunked_loss/test_simpo_loss.py @@ -0,0 +1,215 @@ +import pytest +import torch + +from liger_kernel.chunked_loss import LigerFusedLinearSimPOLoss +from liger_kernel.chunked_loss.functional import liger_fused_linear_simpo +from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction +from liger_kernel.utils import infer_device +from test.chunked_loss.test_cpo_loss import TorchLMHeadCPO +from test.utils import assert_verbose_allclose +from test.utils import set_seed + +device = infer_device() + +# set random seed globally +set_seed() + + +class LigerLMHeadSimPO(torch.nn.Module): + def __init__( + self, + H: int, + V: int, + dtype: torch.dtype, + bias: bool = False, + ignore_index: int = -100, + beta: float = 0.1, + alpha: float = 1.0, + label_smoothing: float = 0.0, + gamma: float = 0.5, + ): + super().__init__() + self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype) + self.simpo_loss = LigerFusedLinearSimPOLoss( + ignore_index=ignore_index, + beta=beta, + alpha=alpha, + gamma=gamma, + label_smoothing=label_smoothing, + ) + + def forward(self, x, y): + return self.simpo_loss(self.lin.weight, x, y, self.lin.bias) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (8, 128, 1024, 4096), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-3, 5e-3), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ignore_index, beta, gamma", [(-100, 0.1, 0.5), (42, 0.2, 0.85)]) +@pytest.mark.parametrize("label_smoothing", [0.0, 0.1]) +def test_correctness( + B, + T, + H, + V, + scalar, + dtype, + atol, + rtol, + bias, + ignore_index, + beta, + gamma, + label_smoothing, +): + B = 2 * B # SimPO loss requires B to be even + + torch_lm_head_simpo = TorchLMHeadCPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + loss_type="simpo", + label_smoothing=label_smoothing, + simpo_gamma=gamma, + ) + liger_lm_head_simpo = LigerLMHeadSimPO( + H=H, + V=V, + dtype=dtype, + bias=bias, + ignore_index=ignore_index, + beta=beta, + label_smoothing=label_smoothing, + gamma=gamma, + ) + + torch_lm_head_simpo.lin.weight.data = liger_lm_head_simpo.lin.weight.data = torch.randn( + V, H, device=device, dtype=dtype + ) + + if bias: + torch_lm_head_simpo.lin.bias.data = liger_lm_head_simpo.lin.bias.data = torch.randn( + V, device=device, dtype=dtype + ) + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + # Assign some random number of elements as ignore_index + num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item() + indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign] + target.view(-1)[indices_to_assign] = ignore_index + + loss1, aggregated_aux_outputs1 = torch_lm_head_simpo(input1, target) + loss2, aggregated_aux_outputs2 = liger_lm_head_simpo(input2, target) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + assert len(aggregated_aux_outputs1) == len(aggregated_aux_outputs2) + + for i in range(len(aggregated_aux_outputs1)): + assert_verbose_allclose( + aggregated_aux_outputs1[i], + 
aggregated_aux_outputs2[i], + atol=atol, + rtol=rtol, + ) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose( + torch_lm_head_simpo.lin.weight.grad, + liger_lm_head_simpo.lin.weight.grad, + atol=atol, + rtol=rtol, + ) + if bias: + assert_verbose_allclose( + torch_lm_head_simpo.lin.bias.grad, + liger_lm_head_simpo.lin.bias.grad, + atol=atol, + rtol=rtol, + ) + + +@pytest.mark.parametrize( + "B, T, H, V", + [ + (2, 2, 8, 8), + (3, 47, 31, 123), # random shape + ], +) +@pytest.mark.parametrize( + "scalar, dtype, atol, rtol", + [ + (1.0, torch.bfloat16, 5e-2, 5e-1), + (1.0, torch.float32, 1e-5, 5e-4), + ], +) +@pytest.mark.parametrize("bias", [True, False]) +def test_correctness_functional(B, T, H, V, scalar, dtype, atol, rtol, bias): + B = 2 * B + + _input = torch.randn(B, T, H, device=device, dtype=dtype) * scalar + input1 = _input.detach().clone().requires_grad_(True) + input2 = _input.detach().clone().requires_grad_(True) + + target = torch.randint( + 0, + V, + ( + B, + T, + ), + device=device, + dtype=torch.long, + ) + + _weight = torch.randn(V, H, device=device, dtype=dtype) + weight1 = _weight.detach().clone().requires_grad_(True) + weight2 = _weight.detach().clone().requires_grad_(True) + + _bias = torch.randn(V, device=device, dtype=dtype) if bias else None + bias1 = _bias.detach().clone().requires_grad_(True) if bias else None + bias2 = _bias.detach().clone().requires_grad_(True) if bias else None + + loss1, aggregated_aux_outputs1 = LigerFusedLinearSimPOFunction.apply(input1, weight1, target, bias1) + loss2, aggregated_aux_outputs2 = liger_fused_linear_simpo(input2, weight2, target, bias2) + + assert_verbose_allclose(loss1, loss2, atol=atol, rtol=rtol) + + loss1.backward() + loss2.backward() + + assert_verbose_allclose(input1.grad, input2.grad, atol=atol, rtol=rtol) + assert_verbose_allclose(weight1.grad, weight2.grad, atol=atol, rtol=rtol) + if bias: + assert_verbose_allclose(bias1.grad, bias2.grad, atol=atol, rtol=rtol) diff --git a/test/conftest.py b/test/conftest.py new file mode 100755 index 0000000000000000000000000000000000000000..3d36a0d2256f2ac058d191394a62fcc1668c9f28 --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,15 @@ +import pytest +import torch + +from liger_kernel.utils import is_npu_available + + +@pytest.fixture(autouse=True) +def clear_gpu_cache(): + yield + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif is_npu_available(): + torch.npu.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() diff --git a/test/convergence/__init__.py b/test/convergence/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/convergence/bf16/__init__.py b/test/convergence/bf16/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py new file mode 100755 index 0000000000000000000000000000000000000000..98400b87041a7fb988ea79fef35d354d50247cff --- /dev/null +++ b/test/convergence/bf16/test_mini_models.py @@ -0,0 +1,2324 @@ +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS + +import pytest +import torch +import transformers + +from datasets import load_from_disk +from packaging import version +from torch.utils.data import DataLoader +from transformers.models.gemma 
import GemmaConfig +from transformers.models.gemma import GemmaForCausalLM +from transformers.models.gemma2 import Gemma2Config +from transformers.models.gemma2 import Gemma2ForCausalLM +from transformers.models.llama import LlamaConfig +from transformers.models.llama import LlamaForCausalLM +from transformers.models.mistral import MistralConfig +from transformers.models.mistral import MistralForCausalLM +from transformers.models.mixtral import MixtralConfig +from transformers.models.mixtral import MixtralForCausalLM +from transformers.models.phi3 import Phi3Config +from transformers.models.phi3 import Phi3ForCausalLM +from transformers.models.qwen2 import Qwen2Config +from transformers.models.qwen2 import Qwen2ForCausalLM + +from liger_kernel.transformers import apply_liger_kernel_to_exaone4 +from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1 +from liger_kernel.transformers import apply_liger_kernel_to_gemma +from liger_kernel.transformers import apply_liger_kernel_to_gemma2 +from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text +from liger_kernel.transformers import apply_liger_kernel_to_glm4 +from liger_kernel.transformers import apply_liger_kernel_to_glm4v +from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe +from liger_kernel.transformers import apply_liger_kernel_to_gpt_oss +from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mistral +from liger_kernel.transformers import apply_liger_kernel_to_mixtral +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from liger_kernel.transformers import apply_liger_kernel_to_olmo2 +from liger_kernel.transformers import apply_liger_kernel_to_olmo3 +from liger_kernel.transformers import apply_liger_kernel_to_phi3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_next +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smollm3 +from liger_kernel.utils import infer_device +from test.utils import DEFAULT_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_exaone4 +from test.utils import revert_liger_kernel_to_falcon_h1 +from test.utils import revert_liger_kernel_to_gemma +from test.utils import revert_liger_kernel_to_gemma2 +from 
test.utils import revert_liger_kernel_to_gemma3_text +from test.utils import revert_liger_kernel_to_glm4 +from test.utils import revert_liger_kernel_to_glm4v +from test.utils import revert_liger_kernel_to_glm4v_moe +from test.utils import revert_liger_kernel_to_gpt_oss +from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mistral +from test.utils import revert_liger_kernel_to_mixtral +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_olmo2 +from test.utils import revert_liger_kernel_to_olmo3 +from test.utils import revert_liger_kernel_to_phi3 +from test.utils import revert_liger_kernel_to_qwen2 +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3 +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_5_moe +from test.utils import revert_liger_kernel_to_qwen3_moe +from test.utils import revert_liger_kernel_to_qwen3_next +from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smollm3 +from test.utils import set_seed +from test.utils import simple_collate_fn +from test.utils import supports_bfloat16 + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +try: + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM + + LLAMA4_AVAILABLE = True +except ImportError: + LLAMA4_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.modeling_mllama import MllamaForCausalLM + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + # Qwen2-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_5_VL_AVAILABLE = False + + +try: + # Qwen2.5-VL is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + + QWEN3_VL_AVAILABLE = 
version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_AVAILABLE = False + + +try: + # Qwen3-VL-MoE is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + + +try: + from transformers.models.qwen3.configuration_qwen3 import Qwen3Config + from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM + from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig + from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeForCausalLM + + QWEN3_AVAILABLE = True +except ImportError: + QWEN3_AVAILABLE = False + +try: + from transformers.models.granite import GraniteConfig + from transformers.models.granite import GraniteForCausalLM + + GRANITE_AVAILABLE = True +except ImportError: + GRANITE_AVAILABLE = False + +try: + from transformers import CLIPVisionConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + # OLMO2 is only available in transformers>=4.47.0 + from transformers.models.olmo2.configuration_olmo2 import Olmo2Config + from transformers.models.olmo2.modeling_olmo2 import Olmo2ForCausalLM + + OLMO2_AVAILABLE = True +except ImportError: + OLMO2_AVAILABLE = False + +try: + # OLMO3 is only available in transformers>=4.57.0 + from transformers.models.olmo3.configuration_olmo3 import Olmo3Config + from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM + + OLMO3_AVAILABLE = True +except ImportError: + OLMO3_AVAILABLE = False + +try: + # Glm4 is only available in transformers>=4.51.3 + from transformers.models.glm4.configuration_glm4 import Glm4Config + from transformers.models.glm4.modeling_glm4 import Glm4ForCausalLM + + GLM4_AVAILABLE = True +except ImportError: + GLM4_AVAILABLE = False + +try: + # Glm4v is only available in transformers>=4.51.3 + from transformers.models.glm4v.configuration_glm4v import Glm4vConfig + from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration + + GLM4V_AVAILABLE = True +except ImportError: + GLM4V_AVAILABLE = False + +try: + # Glm4v_moe is only available in transformers>=4.51.3 + from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig + from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeForConditionalGeneration + + GLM4V_MOE_AVAILABLE = True +except ImportError: + GLM4V_MOE_AVAILABLE = False + +try: + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + # Smollm3 is only available in transformers>=4.53.0 + from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + from transformers.models.smollm3.modeling_smollm3 import 
SmolLM3ForCausalLM + + SMOLLM3_AVAILABLE = True +except ImportError: + SMOLLM3_AVAILABLE = False + +try: + # InternVL is only available in transformers>=4.52.1 + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + + INTERNVL_AVAILABLE = True +except ImportError: + INTERNVL_AVAILABLE = False + +try: + # FalconH1 is only available in transformers>=4.53.0 + from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config + from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM + + FALCONH1_AVAILABLE = True +except ImportError: + FALCONH1_AVAILABLE = False + +try: + # GPT-OSS is only available in transformers>=4.55.0 + from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig + from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM + + GPT_OSS_AVAILABLE = True +except ImportError: + GPT_OSS_AVAILABLE = False + +try: + # Qwen3Next is only available in transformers>=4.57.0 + from transformers.models.qwen3_next.configuration_qwen3_next import Qwen3NextConfig + from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM + + QWEN3NEXT_AVAILABLE = True +except ImportError: + QWEN3NEXT_AVAILABLE = False + +try: + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeForCausalLM + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeTextConfig + + QWEN3_5_MOE_AVAILABLE = True +except ImportError: + QWEN3_5_MOE_AVAILABLE = False + +try: + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + +try: + from transformers.models.exaone4.configuration_exaone4 import Exaone4Config + from transformers.models.exaone4.modeling_exaone4 import Exaone4ForCausalLM + + EXAONE4_AVAILABLE = True +except ImportError: + EXAONE4_AVAILABLE = False + + +device = infer_device() + +MINI_MODEL_SETUPS = { + "mini_llama3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama, + model_class=LlamaForCausalLM, + mini_model_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA 
produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_qwen2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2, + model_class=Qwen2ForCausalLM, + mini_model_config=Qwen2Config( + attention_dropout=0.0, + bos_token_id=1, # 151643 + eos_token_id=2, # 151643 + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, # 131072 + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, # 151936 + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_phi3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_phi3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_phi3, + model_class=Phi3ForCausalLM, + mini_model_config=Phi3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, # 32000 + hidden_act="silu", + hidden_size=896, # 3072 + initializer_range=0.02, + intermediate_size=4864, # 8192 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=None, # defaults to num_attention_heads + rms_norm_eps=1e-5, + sliding_window=None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32064, + attn_implementation="eager", + ), + ), + "mini_mistral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mistral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mistral, + model_class=MistralForCausalLM, + mini_model_config=MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_mixtral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mixtral, + model_class=MixtralForCausalLM, + mini_model_config=MixtralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=512, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=32768, # 32768 + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_gemma1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + # gemma1 model config uses `hidden_act` and point it to gelu, + # 
https://huggingface.co/google/gemma-7b/blob/main/config.json#L10 + # but in reality it's ignored and HuggingFace will use tanh approximation: + # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175 + hidden_act="gelu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma1.1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma2, + model_class=Gemma2ForCausalLM, + mini_model_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ), +} + +if LLAMA4_AVAILABLE: + MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4, + model_class=Llama4ForCausalLM, + mini_model_config=Llama4TextConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=1.0, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + + +if QWEN3_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3, + model_class=Qwen3ForCausalLM, + mini_model_config=Qwen3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_qwen3_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_moe, + model_class=Qwen3MoeForCausalLM, + mini_model_config=Qwen3MoeConfig( + vocab_size=32000, # 151936 + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + ), + ) + +if GPT_OSS_AVAILABLE: + MINI_MODEL_SETUPS["mini_gpt_oss"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gpt_oss, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gpt_oss, + model_class=GptOssForCausalLM, + mini_model_config=GptOssConfig( + vocab_size=32000, # 201088 + hidden_size=896, + intermediate_size=896, # Same as hidden_size for GPT-OSS + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + attention_dropout=0.0, + num_local_experts=8, # Reduced from 32 for mini model + num_experts_per_tok=2, # Reduced from 4 for mini model + router_aux_loss_coef=0.9, + output_router_logits=False, + sliding_window=128, + layer_types=["sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(4)], + ), + ) + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3_text"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma3_text, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3_text, + model_class=Gemma3ForCausalLM, + mini_model_config=Gemma3TextConfig( + vocab_size=32000, # 262144 + hidden_size=1024, # 1152 + intermediate_size=2048, # 6912 + num_hidden_layers=4, # 26 + num_attention_heads=4, + num_key_value_heads=1, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, # 32768 + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ) + + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mllama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForCausalLM, + mini_model_config=MllamaTextConfig( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + 
hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + rope_theta=500_000, + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + # In transformers v5, text-related parameters must be in text_config + text_config={ + "attention_dropout": 0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + "bos_token_id": 1, # 151643 + "eos_token_id": 2, # 151645 + "hidden_act": "silu", + "hidden_size": 1536, # 8192 + "initializer_range": 0.02, + "intermediate_size": 4864, # 29568 + "max_position_embeddings": 32768, + "max_window_layers": 4, # 80 + "num_attention_heads": 12, # 64 + "num_hidden_layers": 4, # 80 + "num_key_value_heads": 2, # 8 + "rms_norm_eps": 1e-6, # 1e-5 + **( + {"rope_parameters": {"mrope_section": [16, 24, 24]}} # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else {"rope_scaling": {"type": "mrope", "mrope_section": [16, 24, 24]}} + ), + "sliding_window": 4096, + "tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + "use_sliding_window": False, + }, + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + vision_config={ + "depth": 4, # 32 + "embed_dim": 1280, + "mlp_ratio": 4, + "num_heads": 16, + "in_chans": 3, + "hidden_size": 128, # 1536 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + }, + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_5_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + # In transformers v5, text-related parameters must be in text_config + text_config={ + "attention_dropout": 0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + "bos_token_id": 1, # 151643 + "eos_token_id": 2, # 151645 + "hidden_act": "silu", + "hidden_size": 1536, # 8192 + "initializer_range": 0.02, + "intermediate_size": 4864, # 29568 + "max_position_embeddings": 32768, + "max_window_layers": 4, # 80 + "num_attention_heads": 12, # 64 + "num_hidden_layers": 4, # 80 + "num_key_value_heads": 2, # 8 + "rms_norm_eps": 1e-6, # 1e-5 + **( + {"rope_parameters": {"mrope_section": [16, 24, 24]}} # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else {"rope_scaling": {"type": "mrope", "mrope_section": [16, 24, 24]}} + ), + "sliding_window": 4096, + 
"tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + "use_sliding_window": False, + }, + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "window_size": 112, + "fullatt_block_indexes": [7, 15, 23, 31], + "tokens_per_second": 2, + "temporal_patch_size": 2, + }, + ), + ) + + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + bos_token_id=1, + eos_token_id=2, + vision_start_token_id=32765, + vision_end_token_id=32766, + image_token_id=32768, + video_token_id=32769, + tie_word_embeddings=False, + attn_implementation="sdpa", + text_config=dict( + attention_dropout=0.0, + hidden_act="silu", + hidden_size=1536, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=12, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + use_cache=True, + vocab_size=32768, + pad_token_id=None, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + vision_config=dict( + depth=4, + hidden_size=128, + hidden_act="silu", + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=128, + num_position_embeddings=256, + deepstack_visual_indexes=[], + initializer_range=0.02, + ), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + bos_token_id=1, + eos_token_id=2, + vision_start_token_id=32765, + vision_end_token_id=32766, + image_token_id=32768, + video_token_id=32769, + tie_word_embeddings=False, + attn_implementation="sdpa", + text_config=Qwen3VLMoeTextConfig( + attention_dropout=0.0, + attention_bias=False, + hidden_act="silu", + hidden_size=1536, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=12, + num_hidden_layers=4, + num_key_value_heads=2, + head_dim=128, + rms_norm_eps=1e-6, + use_cache=True, + vocab_size=32768, + decoder_sparse_step=1, + moe_intermediate_size=3072, + num_experts_per_tok=2, + num_experts=4, + tie_word_embeddings=False, + mlp_only_layers=[], + pad_token_id=None, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ).to_dict(), + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=128, + hidden_act="gelu_pytorch_tanh", + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=128, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 
3], + initializer_range=0.02, + ).to_dict(), + ), + ) + +if GRANITE_AVAILABLE: + MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_granite, + liger_kernel_patch_revert_func=revert_liger_kernel_to_granite, + model_class=GraniteForCausalLM, + mini_model_config=GraniteConfig( + attention_bias=False, + attention_dropout=0.1, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llava, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO2_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo2, + model_class=Olmo2ForCausalLM, + mini_model_config=Olmo2Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + 
tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO3_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3, + model_class=Olmo3ForCausalLM, + mini_model_config=Olmo3Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4, + model_class=Glm4ForCausalLM, + mini_model_config=Glm4Config( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4V_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v, + model_class=Glm4vForConditionalGeneration, + mini_model_config=Glm4vConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + "pad_token_id": None, + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + 
"temporal_patch_size": 2, + }, + ), + ) + +if GLM4V_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v_moe, + model_class=Glm4vMoeForConditionalGeneration, + mini_model_config=Glm4vMoeConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + "attention_dropout": 0.0, + "moe_intermediate_size": 1408, + "num_experts_per_tok": 2, + "n_shared_experts": 1, + "n_routed_experts": 8, + "routed_scaling_factor": 1.0, + "n_group": 1, + "topk_group": 1, + "first_k_dense_replace": 1, + "norm_topk_prob": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if SMOLLM3_AVAILABLE: + MINI_MODEL_SETUPS["mini_smollm3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smollm3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_smollm3, + model_class=SmolLM3ForCausalLM, + mini_model_config=SmolLM3Config( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, # 128000 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + 
num_key_value_heads=2,  # 16
+                max_position_embeddings=4096,  # 8192
+                vocab_size=32000,  # 151936
+                bos_token_id=1,
+                eos_token_id=2,
+                pad_token_id=2,
+                tie_word_embeddings=False,
+            ),
+            vision_config={
+                "hidden_size": 256,  # 1024
+                "intermediate_size": 1024,  # 4096
+                "num_hidden_layers": 4,  # 24
+                "num_attention_heads": 4,  # 16
+            },
+            image_token_id=10,
+            attn_implementation="sdpa",  # default value, pytorch native attention
+        ),
+    )
+
+if FALCONH1_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1,
+        model_class=FalconH1ForCausalLM,
+        mini_model_config=FalconH1Config(
+            model_type="falcon_h1",
+            vocab_size=32000,
+            hidden_size=256,  # 4096
+            num_hidden_layers=4,  # 24
+            num_attention_heads=4,  # 32
+            num_key_value_heads=2,  # 8
+            intermediate_size=1024,  # 11008
+            hidden_act="silu",
+            max_position_embeddings=4096,
+            initializer_range=0.02,
+            rms_norm_eps=1e-6,
+            use_cache=True,
+            pad_token_id=0,
+            bos_token_id=1,
+            eos_token_id=2,
+            tie_word_embeddings=False,
+            mamba_d_ssm=128,  # 1024
+            mamba_n_heads=16,  # 128
+            mamba_d_state=32,  # 245
+            mamba_d_conv=2,  # 4
+            attn_implementation="eager",
+        ),
+    )
+
+if QWEN3NEXT_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_qwen3_next"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_qwen3_next,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_next,
+        model_class=Qwen3NextForCausalLM,
+        mini_model_config=Qwen3NextConfig(  # copied from Qwen3MoeConfig
+            vocab_size=32000,
+            hidden_size=896,
+            intermediate_size=4864,
+            num_hidden_layers=4,
+            num_attention_heads=8,
+            num_key_value_heads=2,
+            hidden_act="silu",
+            max_position_embeddings=32768,
+            initializer_range=0.02,
+            rms_norm_eps=1e-6,
+            use_cache=True,
+            tie_word_embeddings=False,
+            attention_bias=False,
+            use_sliding_window=False,
+            sliding_window=4096,
+            max_window_layers=28,
+            attention_dropout=0.0,
+            decoder_sparse_step=1,
+            moe_intermediate_size=768,
+            num_experts_per_tok=2,
+            num_experts=8,
+            norm_topk_prob=False,
+            output_router_logits=False,
+            router_aux_loss_coef=0.001,
+            # config.dtype must be set if fla is installed, since the original code has a bug
+            # (it calls torch.get_current_dtype(), which does not exist)
+            dtype=torch.bfloat16,
+        ),
+    )
+
+if QWEN3_5_MOE_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_qwen3_5_moe"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5_moe,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5_moe,
+        model_class=Qwen3_5MoeForCausalLM,
+        mini_model_config=Qwen3_5MoeTextConfig(
+            vocab_size=32000,
+            hidden_size=896,
+            num_hidden_layers=4,
+            num_attention_heads=8,
+            num_key_value_heads=2,
+            hidden_act="silu",
+            max_position_embeddings=32768,
+            initializer_range=0.02,
+            rms_norm_eps=1e-6,
+            use_cache=True,
+            tie_word_embeddings=False,
+            attention_bias=False,
+            attention_dropout=0.0,
+            head_dim=128,
+            linear_conv_kernel_dim=4,
+            linear_key_head_dim=64,
+            linear_value_head_dim=64,
+            linear_num_key_heads=8,
+            linear_num_value_heads=8,
+            moe_intermediate_size=768,
+            shared_expert_intermediate_size=768,
+            num_experts_per_tok=2,
+            num_experts=8,
+            output_router_logits=False,
+            router_aux_loss_coef=0.001,
+            # config.dtype must be set if fla is installed, since the original code has a bug
+            # (it calls torch.get_current_dtype(), which does not exist)
+            dtype=torch.bfloat16,
+        ),
+    )
+
+if QWEN3_5_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5,
+
liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForCausalLM, + mini_model_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + head_dim=128, + linear_conv_kernel_dim=4, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=8, + linear_num_value_heads=8, + layer_types=["linear_attention", "linear_attention", "linear_attention", "full_attention"], + dtype=torch.bfloat16, + ), + ) + +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config=HunYuanMoEV1Config( + vocab_size=32000, + hidden_size=128, + intermediate_size=512, + head_dim=16, + num_hidden_layers=2, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + num_experts=2, + moe_topk=1, + attn_implementation="sdpa", + ), + ) + +if EXAONE4_AVAILABLE: + MINI_MODEL_SETUPS["mini_exaone4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_exaone4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_exaone4, + model_class=Exaone4ForCausalLM, + mini_model_config=Exaone4Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + pad_token_id=None, + ), + ) + + +def create_model(model_name="mini_llama4"): + """ + Create a mini version model + The commented values are the original values + """ + model_config = MINI_MODEL_SETUPS[model_name].mini_model_config + model_class = MINI_MODEL_SETUPS[model_name].model_class + return model_class(model_config) + + +@require_deterministic +def run_mini_model( + model_name="mini_llama4", + num_steps=100, + dtype=torch.bfloat16, + lr=1e-5, + with_liger=False, +): + # If we move it to the beginning of test_mini_model, the two runs are initialized with different weights. 
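+    # A minimal, hypothetical illustration of the seeding behavior explained below
+    # (kept as a comment so the test flow is unchanged): re-seeding immediately before
+    # each construction makes two independently created modules start from identical
+    # weights, whereas dropping the second set_seed would not.
+    #
+    #     set_seed(42)
+    #     a = torch.nn.Linear(8, 8)
+    #     set_seed(42)
+    #     b = torch.nn.Linear(8, 8)
+    #     assert torch.equal(a.weight, b.weight)  # same RNG state -> same init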
+    # This is due to the RNG (random number generator): a linear congruential generator advances as x_(n+1) = (a * x_n + c) % m.
+    # Every time the RNG is used, e.g. when randomly initializing weights, it advances to the next state.
+    # Therefore, we have to reset the RNG before creating each model to ensure weight initialization starts from the same RNG state.
+
+    set_seed(42)
+
+    revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]}
+    if "mllama" in model_name:
+        revert_kwargs["model_type"] = "causal_lm"
+
+    if with_liger:
+        kwargs = {
+            "rope": True,
+            "rms_norm": True,
+        }
+
+        if "glm4" in model_name or "qwen3_next" in model_name or "qwen3_5" in model_name:
+            kwargs["rope"] = False
+
+        model_supports_layer_norm = "qwen2_vl" in model_name
+        if model_supports_layer_norm:
+            kwargs["layer_norm"] = True
+
+        if "gemma" in model_name:
+            kwargs["geglu"] = True
+        else:
+            kwargs["swiglu"] = True
+
+        if "llava" in model_name:
+            apply_liger_kernel_to_llama(**kwargs)
+
+        # fused_linear_cross_entropy is not supported in mini_granite3
+        kwargs["fused_linear_cross_entropy"] = model_name != "mini_granite3"
+        kwargs["cross_entropy"] = False
+
+        MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs)
+    else:
+        MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
+
+    model = create_model(model_name).to(dtype).to(device)
+
+    train_dataset = load_from_disk(DEFAULT_DATASET_PATH)
+    loader = DataLoader(train_dataset, batch_size=16, shuffle=False, collate_fn=simple_collate_fn)
+    loader_iter = iter(loader)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+
+    loss_list = []
+
+    for i in range(num_steps):
+        batch = next(loader_iter).to(model.device)
+        optimizer.zero_grad()
+        output = model(**batch, accum_dtype=torch.float32)
+        output.loss.backward()
+        optimizer.step()
+        print(f"Step {i}, Loss: {output.loss.item()}")
+        loss_list.append(output.loss.item())
+
+    model.eval()
+    eval_batch = next(loader_iter).to(model.device)
+    if with_liger:
+        # Materialize logits in the eval pass so top-k logprobs can be compared
+        eval_batch["skip_logits"] = False
+    with torch.no_grad():
+        eval_output = model(**eval_batch)
+    print(f"Eval Loss: {eval_output.loss.item()}")
+    loss_list.append(eval_output.loss.item())
+    topk_logprobs = get_topk(get_logprobs(eval_output.logits))
+    MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs)
+    return {
+        "loss": loss_list,
+        "topk_logprobs": topk_logprobs.values,
+        "model": model,
+    }
+
+
+@pytest.mark.parametrize(
+    "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol",
+    [
+        pytest.param(
+            "mini_llama4",  # llama4 requires slightly larger tolerances to pass this test after the llama4 bug fix in transformers v5.0.0
+            32,
+            1e-5,
+            torch.bfloat16,
+            5e-2,
+            4e-1,
+            1e-1,
+            1e-1,
+            1e-2,
+            1e-2,
+            marks=[
+                pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+                pytest.mark.skipif(
+                    not LLAMA4_AVAILABLE,
+                    reason="Llama4 not available in this version of transformers",
+                ),
+                pytest.mark.skipif(
+                    not IS_TRANSFORMERS_V5_OR_LATER,
+                    reason="The `attention_bias` configuration of Llama4 is not set in Transformers v4",
+                ),
+            ],
+        ),
+        pytest.param(
+            "mini_llama3",
+            32,
+            1e-5,
+            torch.bfloat16,
+            1e-2,
+            5e-2,
+            1e-1,
+            1e-2,
+            1e-2,
+            1e-2,
+            marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+        ),
+        pytest.param(
+            "mini_llava",
+            32,
+            1e-5,
+            torch.bfloat16,
+            1e-2,
+            5e-2,
+            1e-1,
+            1e-1,
+            1e-2,
+            1e-2,
+            marks=[
+                pytest.mark.skipif(not
supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + pytest.mark.skipif( + version.parse(transformers.__version__) < version.parse("4.52.0"), + reason="LLaVa doesn't materialize logits in transformers<=4.52.0 so we can't test it", + ), + ], + ), + pytest.param( + "mini_granite3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-2, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GRANITE_AVAILABLE, + reason="Granite not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_mllama", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen2", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_qwen3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ], + ), + # TODO(tcc): Investigate qwen3_moe on different machines. + # The loss diverges on ci test (A10G), but it never diverges on my local machine (3080). + # Qwen3_moe can pass float32 tests. 
(mecoli1219): diverges on h100 + pytest.param( + "mini_qwen3_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 2e-1, + 1e-1, # 1e-1 + 1e-1, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_gpt_oss", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GPT_OSS_AVAILABLE, + reason="GPT-OSS not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen2_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-2, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen2_5_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-2, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-2, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 1e-1, + 1e-1, + 5e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_phi3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_mistral", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ], + ), + pytest.param( + "mini_olmo2", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not OLMO2_AVAILABLE, + reason="OLMO2 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_olmo3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not OLMO3_AVAILABLE, + reason="OLMO3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4_AVAILABLE, + 
reason="Glm4 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4v", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 2e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4V_AVAILABLE, + reason="Glm4v not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4v_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 4e-1, # rms_norm patch needs higher tolerance in bf16 + 1e-1, + 5e-1, # rms_norm patch needs higher tolerance in bf16 + 2e-1, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4V_MOE_AVAILABLE, + reason="Glm4v_moe not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_smollm3", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not SMOLLM3_AVAILABLE, + reason="Smollm3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_internvl", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ], + ), + # TODO: mixtral is flaky so disable the test for now + # pytest.param( + # "mini_mixtral", + # 32, + # 1e-4, + # torch.bfloat16, + # 1e-3, + # 1e-2, + # 1e-1, + # 1e-2, + # 1e-1, + # 1e-2, + # marks=pytest.mark.skipif( + # not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + # ), + # ), + # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match (casts are not done the same way) + pytest.param( + "mini_gemma1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_gemma1.1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + # TODO: Gemma2 test for bf16 is not passing within the tolerance range, might be casting issue, need to investigate + # pytest.param( + # "mini_gemma2", + # 32, + # 1e-4, + # torch.bfloat16, + # 1e-3, + # 1e-2, + # 1e-1, + # 1e-2, + # 1e-2, + # 1e-2, + # marks=pytest.mark.skipif( + # not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + # ), + # ), + pytest.param( + "mini_gemma3_text", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_falcon_h1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not FALCONH1_AVAILABLE, + reason="FalconH1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_next", + 32, + 1e-5, + torch.bfloat16, + 
1e-2, + 1e-2, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3NEXT_AVAILABLE, + reason="Qwen3Next not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_5_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 2e-1, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_5_MOE_AVAILABLE, + reason="Qwen3_5Moe not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 2e-1, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3_5 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-1, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_exaone4", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not EXAONE4_AVAILABLE, + reason="EXAONE4 not available in this version of transformers", + ), + ], + ), + ], +) +def test_mini_model( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + + expected_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True) + + # Compare every step of the loss + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None: + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + # Iterate over the model's parameters and compare them + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], actual_param[1], atol=param_atol, rtol=param_rtol, extra_info="[Model parameters]" + ) diff --git a/test/convergence/bf16/test_mini_models_multimodal.py 
b/test/convergence/bf16/test_mini_models_multimodal.py new file mode 100755 index 0000000000000000000000000000000000000000..5495c8bbbbc51b4fdaa06d179ed1e9e3b8c46ed0 --- /dev/null +++ b/test/convergence/bf16/test_mini_models_multimodal.py @@ -0,0 +1,1830 @@ +import functools +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS +import pytest +import torch +import transformers + +from datasets import load_dataset +from packaging import version +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizerFast +from transformers.models.siglip.configuration_siglip import SiglipVisionConfig + +from liger_kernel.transformers import apply_liger_kernel_to_gemma3 +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from liger_kernel.transformers import apply_liger_kernel_to_paligemma +from liger_kernel.transformers import apply_liger_kernel_to_pixtral +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smolvlm +from liger_kernel.utils import infer_device +from test.utils import FAKE_CONFIGS_PATH +from test.utils import UNTOKENIZED_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import is_torchvision_available +from test.utils import load_image_processing_config +from test.utils import load_processor_config +from test.utils import load_tokenizer_config +from test.utils import multimodal_collate_fn +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_gemma3 +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_Paligemma +from test.utils import revert_liger_kernel_to_pixtral +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smolvlm2 +from test.utils import set_seed +from test.utils import supports_bfloat16 +from test.utils import train_bpe_tokenizer + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.gemma.tokenization_gemma import GemmaTokenizer +else: + from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast as GemmaTokenizer + +try: + # Qwen2-VL is only available in transformers>=4.52.4 + import transformers + + from packaging import version + + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: 
+ from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor + from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: + from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_5_VL_AVAILABLE = False + +try: + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: + from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor + from transformers.models.qwen3_vl.video_processing_qwen3_vl import Qwen3VLVideoProcessor + + QWEN3_VL_AVAILABLE = True +except ImportError: + QWEN3_VL_AVAILABLE = False + +try: + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = True +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + +try: + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5Config + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5VisionConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration + from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor + from 
transformers.models.qwen3_vl.video_processing_qwen3_vl import Qwen3VLVideoProcessor + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaConfig + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.configuration_mllama import MllamaVisionConfig + from transformers.models.mllama.image_processing_mllama import MllamaImageProcessor + from transformers.models.mllama.modeling_mllama import MllamaForConditionalGeneration + from transformers.models.mllama.processing_mllama import MllamaProcessor + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + from transformers import CLIPImageProcessor + from transformers import CLIPVisionConfig + from transformers import LlamaConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + from transformers.models.llava.processing_llava import LlavaProcessor + + from liger_kernel.transformers import apply_liger_kernel_to_llama + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + import transformers + + from packaging import version + from transformers.models.gemma.configuration_gemma import GemmaConfig + from transformers.models.gemma2.configuration_gemma2 import Gemma2Config + from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig + from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration + from transformers.models.paligemma.processing_paligemma import PaliGemmaProcessor + from transformers.models.siglip.configuration_siglip import SiglipVisionConfig + from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor + + PALIGEMMA_AVAILABLE = True +except ImportError: + PALIGEMMA_AVAILABLE = False + + +try: + # Gemma3 is only available in transformers>=4.50.0 + from transformers.models.gemma3.configuration_gemma3 import Gemma3Config + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from transformers.models.gemma3.image_processing_gemma3 import Gemma3ImageProcessor + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration + from transformers.models.gemma3.processing_gemma3 import Gemma3Processor + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + from transformers.models.llama4.configuration_llama4 import Llama4Config + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig + from transformers.models.llama4.image_processing_llama4_fast import Llama4ImageProcessorFast + from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration + from transformers.models.llama4.processing_llama4 import Llama4Processor + + LLAMA4_AVAILABLE = True + +except ImportError: + LLAMA4_AVAILABLE = False + +try: + from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + from transformers.models.internvl.processing_internvl import InternVLProcessor + from 
transformers.models.internvl.video_processing_internvl import InternVLVideoProcessor
+    from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+
+    # Passing fp32 inputs to bf16 CNN-based models in InternVL only works in transformers>=4.56.0
+    INTERNVL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.56.0")
+except ImportError:
+    INTERNVL_AVAILABLE = False
+
+try:
+    # SmolVLM2 is only available in transformers>=4.50.0
+    from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
+    from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig
+    from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
+    from transformers.models.smolvlm.processing_smolvlm import SmolVLMProcessor
+    from transformers.models.smolvlm.video_processing_smolvlm import SmolVLMVideoProcessor
+
+    SMOLVLM2_AVAILABLE = True
+except ImportError:
+    SMOLVLM2_AVAILABLE = False
+
+try:
+    from transformers.models.pixtral.configuration_pixtral import PixtralVisionConfig
+    from transformers.models.pixtral.modeling_pixtral import PixtralVisionModel
+
+    PIXTRAL_AVAILABLE = True
+except ImportError:
+    PIXTRAL_AVAILABLE = False
+
+try:
+    from num2words import num2words  # noqa: F401
+
+    NUM2WORDS_AVAILABLE = True
+except ImportError:
+    NUM2WORDS_AVAILABLE = False
+
+
+device = infer_device()
+
+torch.use_deterministic_algorithms(True)
+
+# Only setting torch.use_deterministic_algorithms(True) throws the following error:
+# RuntimeError: Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or `at::Context::setDeterministicAlgorithms(true)`,
+# but this operation is not deterministic because it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this case, you must set an
+# environment variable before running your PyTorch application: CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8. For more information,
+# go to https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
+TEST_IMAGE_DIM = 64
+
+MINI_MODEL_SETUPS = {}
+
+if LLAMA4_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig(
+        liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_llama4, fused_linear_cross_entropy=False),
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4,
+        model_class=Llama4ForConditionalGeneration,
+        mini_model_config=Llama4Config(
+            image_token_index=8,
+            vision_config=Llama4VisionConfig(
+                attn_implementation_autoset=True,
+                attention_dropout=0.0,
+                hidden_act="gelu",
+                hidden_size=512,  # 1280
+                image_size=560,  # 560
+                initializer_range=0.02,
+                intermediate_layers_indices=[2],  # [3, 7, 15, etc...]
+                intermediate_size=2048,  # 5120
+                max_num_tiles=1,  # 4
+                norm_eps=1e-5,
+                num_attention_heads=4,  # 16
+                num_channels=3,
+                num_global_layers=2,  # 8
+                num_hidden_layers=8,  # 32
+                patch_size=280,  # 14
+                supported_aspect_ratios=[[1, 1]],  # [[1, 1], [1, 2], etc... ]
+                vision_output_dim=4096,  # 7680
+            ),
+            text_config=Llama4TextConfig(
+                bos_token_id=0,
+                eos_token_id=0,
+                pad_token_id=0,
+                cross_attention_layers=[2],  # [3, 8, 13, 18, etc...]
+ dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + ), + attn_implementation="sdpa", + pad_token_id=None, + ), + ) + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_mllama, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForConditionalGeneration, + mini_model_config=MllamaConfig( + vision_config=MllamaVisionConfig( + hidden_act="gelu", + hidden_size=512, # 1280 + image_size=560, # 560 + initializer_range=0.02, + intermediate_layers_indices=[2], # [3, 7, 15, etc...] + intermediate_size=2048, # 5120 + max_num_tiles=1, # 4 + norm_eps=1e-5, + num_attention_heads=4, # 16 + num_channels=3, + num_global_layers=2, # 8 + num_hidden_layers=8, # 32 + patch_size=140, # 14 + supported_aspect_ratios=[[1, 1]], # [[1, 1], [1, 2], etc... ] + vision_output_dim=1024, # 7680 + ), + text_config=MllamaTextConfig( + bos_token_id=0, + eos_token_id=0, + pad_token_id=0, + cross_attention_layers=[2], # [3, 8, 13, 18, etc...] + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + ), + image_token_index=1, # NOTE: outside the vocab size + attn_implementation="sdpa", + ), + ) + +if PALIGEMMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma, + model_class=PaliGemmaForConditionalGeneration, + mini_model_config=PaliGemmaConfig( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + projection_dim=1024, # 2304 + ), + text_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + image_token_index=4, # NOTE: outside the vocab size + attn_implementation="eager", + vocab_size=32000, + projection_dim=1024, + ), + ) + 
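# mini_paligemma2 below mirrors mini_paligemma, swapping the Gemma text backbone (GemmaConfig) for Gemma2 (Gemma2Config).
+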
MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma, + model_class=PaliGemmaForConditionalGeneration, + mini_model_config=PaliGemmaConfig( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + projection_dim=1024, # 2304 + ), + text_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + image_token_index=4, # NOTE: outside the vocab size + attn_implementation="eager", + vocab_size=32000, + projection_dim=1024, + ), + ) + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_gemma3, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3, + model_class=Gemma3ForConditionalGeneration, + mini_model_config=Gemma3Config( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + ).to_dict(), + text_config=Gemma3TextConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ).to_dict(), + image_token_index=5, # NOTE: outside the vocab size + boi_token_index=4, + eoi_token_index=6, + attn_implementation="eager", + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_vl, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + attention_dropout=0.0, + # Token Ids and vocab size must match those in the tokenizer/processor + # test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json + bos_token_id=0, + eos_token_id=0, + vision_start_token_id=1, + vision_end_token_id=2, + vision_token_id=3, + image_token_id=4, + video_token_id=5, + hidden_act="silu", + hidden_size=1024, # 8192 + initializer_range=0.02, + intermediate_size=1024, # 29568 
+ max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=8, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=True, + use_cache=False, # True + vocab_size=32000, # 152064, + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "embed_dim": 128, # 1280 + "mlp_ratio": 1, + "num_heads": 8, # 16 + "in_chans": 3, + "hidden_size": 1024, # 1536 + }, + attn_implementation="sdpa", + ), + ) + +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_llava, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + num_key_value_heads=2, # 16 + max_position_embeddings=4096, # 8192 + vocab_size=32000, # 151936 + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + tie_word_embeddings=False, + ), + vision_config={ + "hidden_size": 256, # 1024 + "intermediate_size": 1024, # 4096 + "num_hidden_layers": 4, # 24 + "num_attention_heads": 4, # 16 + }, + image_token_id=24, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if SMOLVLM2_AVAILABLE: + MINI_MODEL_SETUPS["mini_smolvlm2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smolvlm, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_smolvlm2, + model_class=SmolVLMForConditionalGeneration, + mini_model_config=SmolVLMConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + hidden_act="silu", + hidden_size=576, # 576 for 256M model + initializer_range=0.041666666666666664, + intermediate_size=1536, # 1536 for 256M model + max_position_embeddings=8192, + num_attention_heads=9, # 9 for 256M model + num_hidden_layers=4, # 30 -> reduced to 4 for testing + num_key_value_heads=3, # 3 for 256M model + rms_norm_eps=1e-5, + tie_word_embeddings=False, + vocab_size=49280, + ), + vision_config={ + "hidden_size": 768, + "intermediate_size": 3072, + "num_hidden_layers": 4, # 12 -> reduced to 4 for testing + "num_attention_heads": 12, + "image_size": 512, + "patch_size": 16, + }, + image_token_id=49190, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_5_vl, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + attention_dropout=0.0, + # Token Ids and vocab size must match those in the tokenizer/processor + # test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json + bos_token_id=0, + eos_token_id=0, + vision_start_token_id=1, + vision_end_token_id=2, + vision_token_id=3, + image_token_id=4, + video_token_id=5, + hidden_act="silu", + hidden_size=1024, # 8192 + initializer_range=0.02, + intermediate_size=1024, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=8, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=True, + use_cache=False, # True + vocab_size=32000, # 152064, + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "hidden_size": 128, # 1280 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 1024, + }, + attn_implementation="sdpa", + ), + ) + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3VLVisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + text_config=Qwen3VLTextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + 
use_cache=False, + tie_word_embeddings=True, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + attention_dropout=0.0, + attention_bias=False, + ).to_dict(), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + text_config=Qwen3VLMoeTextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=False, + tie_word_embeddings=True, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + attention_dropout=0.0, + attention_bias=False, + decoder_sparse_step=1, + moe_intermediate_size=1024, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + pad_token_id=None, + ).to_dict(), + ), + ) + +if QWEN3_5_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForConditionalGeneration, + mini_model_config=Qwen3_5Config( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3_5VisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + text_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=False, + tie_word_embeddings=True, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + attention_dropout=0.0, + attention_bias=False, + decoder_sparse_step=1, + moe_intermediate_size=1024, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + pad_token_id=None, + ).to_dict(), + ), + ) + + +if PIXTRAL_AVAILABLE: + MINI_MODEL_SETUPS["mini_pixtral"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_pixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_pixtral, + model_class=PixtralVisionModel, + mini_model_config=PixtralVisionConfig( + 
hidden_size=1024, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_channels=3, + image_size=256, + patch_size=16, + hidden_act="silu", + attention_dropout=0.0, + rope_theta=10000.0, + initializer_range=0.02, + ), + ) + + +def create_processor(model_name: str): + if model_name == "mini_qwen2_vl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor() + video_processor = Qwen2VLVideoProcessor() + return Qwen2VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name == "mini_qwen2_5_vl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor() + video_processor = Qwen2VLVideoProcessor() + return Qwen2_5_VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name in ("mini_qwen3_vl", "mini_qwen3_vl_moe", "mini_qwen3_5"): + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2) + video_processor = Qwen3VLVideoProcessor() + return Qwen3VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name == "mini_llava": + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/tokenizer_config.json", + ) + ) + image_processor_config = load_image_processing_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/preprocessor_config.json", + ) + ) + processor_config = load_processor_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/processor_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + fast_tokenizer.model_input_names = ["input_ids", "attention_mask"] + image_processor = CLIPImageProcessor(**image_processor_config) + + return LlavaProcessor(**processor_config, image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name == "mini_internvl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "OpenGVLab/InternVL3-1B-hf/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + 
token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = GotOcr2ImageProcessorFast( + crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448} + ) + video_processor = InternVLVideoProcessor() + + # Return proper InternVL processor + return InternVLProcessor( + image_processor=image_processor, tokenizer=qwen_tokenizer, video_processor=video_processor + ) + + elif model_name == "mini_smolvlm2": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = SmolVLMImageProcessor(size={"longest_edge": 512}) + video_processor = SmolVLMVideoProcessor() + + # Return proper SmolVLM processor + return SmolVLMProcessor( + image_processor=image_processor, tokenizer=gpt2_tokenizer, video_processor=video_processor + ) + + elif model_name.startswith("mini_llama4"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Llama4ImageProcessorFast(size={"height": 560, "width": 560}) + return Llama4Processor( + image_processor=image_processor, + tokenizer=fast_tokenizer, + fake_image_token="<|image|>", + image_token="<|image|>", + ) + elif model_name == "mini_mllama": + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = MllamaImageProcessor(size={"height": 560, "width": 560}) + return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name.startswith("mini_paligemma"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256) + return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name.startswith("mini_gemma3"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Google/Gemma3/gemma-3-4b-it/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + 
for key, token in sorted(
+                    tokenizer_config["added_tokens_decoder"].items(),
+                    key=lambda x: int(x[0]),
+                )
+            ]
+        )
+        fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
+        image_processor = Gemma3ImageProcessor()
+        return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer)
+
+    else:
+        raise ValueError(f"Processor not available for model {model_name}")
+
+
+def create_multimodal_dataset(model_name: str):
+    processor = create_processor(model_name)
+
+    def generate_procedural_image(example, index):
+        """Generate an image with a single row of white pixels at the index specified"""
+        image = torch.zeros(3, TEST_IMAGE_DIM, TEST_IMAGE_DIM)
+        image[:, index % TEST_IMAGE_DIM, :] = 255
+        example["image"] = image
+        return example
+
+    def apply_chat_template(example):
+        """
+        Under the hood, this inserts the correct image placeholder token into the text.
+        More or less, this conversation format is used by HF's MLLMs. The fact that it is
+        formatted as IFT data is not in and of itself important here.
+        """
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [{"type": "text", "text": example["text"]}],
+            },
+        ]
+        example["text"] = processor.tokenizer.apply_chat_template(conversation, tokenize=False)
+        return example
+
+    def preprocess_function(examples):
+        """Tokenize text, preprocess images, and generate other relevant inputs for the model."""
+        if model_name == "mini_llama4":
+            # Process images and text separately to avoid complex token replacement; this allowed setting a lower tolerance than processing them together.
+            image_inputs = processor.image_processor(images=examples["image"], return_tensors="pt")
+            text_inputs = processor.tokenizer(
+                examples["text"],
+                padding="max_length",
+                truncation=True,
+                max_length=1024,
+                return_tensors="pt",
+            )
+            return {**text_inputs, **image_inputs}
+        else:
+            # For other models, use the normal processor
+            results = processor(
+                text=examples["text"],
+                images=examples["image"],
+                padding="max_length",
+                truncation=True,
+                max_length=1024,  # longer than for text-only b/c images require quite a few tokens
+                return_tensors="pt",
+            )
+            return results
+
+    train_dataset = (
+        load_dataset("text", data_files={"train": UNTOKENIZED_DATASET_PATH}, split="train")
+        .to_iterable_dataset()  # only map examples as needed, on demand
+        .map(generate_procedural_image, with_indices=True)
+        .map(apply_chat_template)
+        .map(preprocess_function, remove_columns=["text", "image"])
+    )
+    return train_dataset
+
+
+def create_model(model_name):
+    """
+    Create a mini version of the model.
+    The commented values are the original values.
+    """
+    model_config = MINI_MODEL_SETUPS[model_name].mini_model_config
+    model_class = MINI_MODEL_SETUPS[model_name].model_class
+    return model_class(model_config)
+
+
+@require_deterministic
+def run_mini_model_multimodal(
+    model_name="mini_qwen2_vl",
+    num_steps=100,
+    dtype=torch.bfloat16,
+    lr=1e-5,
+    with_liger=False,
+):
+    # If we moved set_seed to the beginning of test_mini_model_multimodal, the two runs would be initialized with different weights.
+    # This is due to the RNG (Random Number Generator): for a linear congruential generator, for example, the state progresses as x_(n+1) = (a * x_n + c) % m.
+    # Every time the RNG is used, e.g. when randomly initializing weights, it advances to the next state.
+    # Therefore, we have to reset the RNG before we create the model to ensure that weight initialization starts from the same RNG state.
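+    # Illustrative sketch of this behavior (assumes torch's default CPU generator; not executed by this test):
+    #     torch.manual_seed(42); a = torch.randn(3)  # each draw advances the RNG state
+    #     b = torch.randn(3)                         # b != a, since the state has moved on
+    #     torch.manual_seed(42); c = torch.randn(3)  # c == a, since the state was reset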
+ + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + if "mllama" in model_name or "llama4" in model_name or "qwen3_5" in model_name: + revert_kwargs["model_type"] = "conditional_generation" + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + "cross_entropy": False, + } + + if ( + "qwen2_5_vl" not in model_name + and "llava" not in model_name + and "qwen3_vl" not in model_name + and "qwen3_5" not in model_name + ): + kwargs["layer_norm"] = True + + if "qwen3_5" in model_name: + kwargs["rope"] = False + + if "gemma" in model_name: + kwargs["geglu"] = True + else: + kwargs["swiglu"] = True + + if "llava" in model_name: + apply_liger_kernel_to_llama(**kwargs) + + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + model.gradient_checkpointing_enable() + + train_dataset = create_multimodal_dataset(model_name) + loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=multimodal_collate_fn) + loader_iter = iter(loader) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + batch = next(loader_iter).to(model.device) + optimizer.zero_grad() + supports_accum = getattr(model, "_supports_accum_dtype", None) + if supports_accum is None: + import inspect + + params = inspect.signature(model.forward).parameters + supports_accum = ("accum_dtype" in params) or any( + p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values() + ) + setattr(model, "_supports_accum_dtype", supports_accum) + + output = model(**batch, accum_dtype=torch.float32) if supports_accum else model(**batch) + output.loss.backward() + optimizer.step() + + print(f"Step {i}, Loss: {output.loss.item()}") + loss_list.append(output.loss.item()) + + model.eval() + eval_batch = next(loader_iter).to(model.device) + if with_liger: + eval_batch["skip_logits"] = False + with torch.no_grad(): + eval_output = model(**eval_batch) + print(f"Eval Loss: {eval_output.loss.item()}") + loss_list.append(eval_output.loss.item()) + topk_logprobs = get_topk(get_logprobs(eval_output.logits)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_qwen2_vl", + 32, + 1e-4, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + pytest.mark.skipif(not is_torchvision_available(), reason="Qwen2VLVideoProcessor requires torchvision"), + ], + ), + pytest.param( + "mini_llava", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_internvl", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), 
reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_smolvlm2", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not SMOLVLM2_AVAILABLE, + reason="SmolVLM2 not available in this version of transformers", + ), + pytest.mark.skipif( + not NUM2WORDS_AVAILABLE, + reason="num2words must be present to run SmolVLMProcessor", + ), + ], + ), + pytest.param( + "mini_qwen2_5_vl", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + pytest.mark.skipif(not is_torchvision_available(), reason="Qwen2VLVideoProcessor requires torchvision"), + ], + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + ], + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + ], + ), + pytest.param( + "mini_mllama", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + pytest.mark.skipif( + version.parse("4.51.0") > version.parse(transformers.__version__), + reason="MllamaForConditionalGeneration doesn't accecpt `skip_logits` kwargs", + ), + ], + ), + pytest.param( + "mini_llama4", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not LLAMA4_AVAILABLE, + reason="Llama4 not available in this version of transformers", + ), + # TODO: Remove this skipif when the bug fix is released in Transformers + pytest.mark.skipif( + version.parse(transformers.__version__) <= version.parse("5.1.0"), + reason="Wait for this bug fix to be released in Transformers: https://github.com/huggingface/transformers/pull/43882", + ), + ], + ), + pytest.param( + "mini_paligemma", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not PALIGEMMA_AVAILABLE, + reason="Paligemma not available in this version of transformers", + ), + ], + ), + pytest.param( + 
"mini_paligemma2", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not PALIGEMMA_AVAILABLE, + reason="Paligemma2 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_gemma3", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3.5 not available in this version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + ], + ), + ], +) +def test_mini_model_multimodal( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + expected_output = run_mini_model_multimodal(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model_multimodal( + model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True + ) + + # Compare the loss of every step + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + # Iterate over the model's parameters and compare them + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) + + +# +# Vision-only model tests (e.g. Pixtral vision encoder) +# + + +def generate_procedural_pixel_values(batch_size, num_channels, image_size, index, dtype, device): + """Generate deterministic pixel values for vision-only model testing. + + Each image has a single row of white pixels at a deterministic position, + providing a reproducible signal for convergence testing. 
+ """ + pixel_values = torch.zeros(batch_size, num_channels, image_size, image_size, dtype=dtype, device=device) + for b in range(batch_size): + row = (index + b) % image_size + pixel_values[b, :, row, :] = 1.0 + return pixel_values + + +@require_deterministic +def run_mini_model_vision( + model_name="mini_pixtral", + num_steps=100, + dtype=torch.bfloat16, + lr=1e-5, + with_liger=False, +): + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + "swiglu": True, + } + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + optimizer.zero_grad() + pixel_values = generate_procedural_pixel_values( + batch_size=2, + num_channels=model.config.num_channels, + image_size=model.config.image_size, + index=i, + dtype=dtype, + device=device, + ) + output = model(pixel_values=pixel_values) + loss = output.last_hidden_state.sum() + loss.backward() + optimizer.step() + print(f"Step {i}, Loss: {loss.item()}") + loss_list.append(loss.item()) + + # Eval step with deterministic input + model.eval() + with torch.no_grad(): + eval_pixel_values = generate_procedural_pixel_values( + batch_size=2, + num_channels=model.config.num_channels, + image_size=model.config.image_size, + index=num_steps, + dtype=dtype, + device=device, + ) + eval_output = model(pixel_values=eval_pixel_values) + + topk_logprobs = get_topk(get_logprobs(eval_output.last_hidden_state)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_pixtral", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-0, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not PIXTRAL_AVAILABLE, reason="Pixtral not available in this version of transformers" + ), + ], + ), + ], +) +def test_mini_model_vision( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + expected_output = run_mini_model_vision(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model_vision( + model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True + ) + + # Compare the loss of every step + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + 
expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py new file mode 100755 index 0000000000000000000000000000000000000000..ff0d12304f8af5adca7707686cf309101ab7ecd8 --- /dev/null +++ b/test/convergence/bf16/test_mini_models_with_logits.py @@ -0,0 +1,2167 @@ +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS + +import pytest +import torch +import transformers + +from datasets import load_from_disk +from packaging import version +from torch.utils.data import DataLoader +from transformers.models.gemma import GemmaConfig +from transformers.models.gemma import GemmaForCausalLM +from transformers.models.gemma2 import Gemma2Config +from transformers.models.gemma2 import Gemma2ForCausalLM +from transformers.models.llama import LlamaConfig +from transformers.models.llama import LlamaForCausalLM +from transformers.models.mistral import MistralConfig +from transformers.models.mistral import MistralForCausalLM +from transformers.models.mixtral import MixtralConfig +from transformers.models.mixtral import MixtralForCausalLM +from transformers.models.phi3 import Phi3Config +from transformers.models.phi3 import Phi3ForCausalLM +from transformers.models.qwen2 import Qwen2Config +from transformers.models.qwen2 import Qwen2ForCausalLM + +from liger_kernel.transformers import apply_liger_kernel_to_exaone4 +from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1 +from liger_kernel.transformers import apply_liger_kernel_to_gemma +from liger_kernel.transformers import apply_liger_kernel_to_gemma2 +from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text +from liger_kernel.transformers import apply_liger_kernel_to_glm4 +from liger_kernel.transformers import apply_liger_kernel_to_glm4v +from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe +from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mistral +from liger_kernel.transformers import apply_liger_kernel_to_mixtral +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from liger_kernel.transformers import apply_liger_kernel_to_olmo2 +from liger_kernel.transformers import apply_liger_kernel_to_olmo3 +from liger_kernel.transformers import apply_liger_kernel_to_phi3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_next +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import 
apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smollm3 +from liger_kernel.utils import infer_device +from test.utils import DEFAULT_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_exaone4 +from test.utils import revert_liger_kernel_to_falcon_h1 +from test.utils import revert_liger_kernel_to_gemma +from test.utils import revert_liger_kernel_to_gemma2 +from test.utils import revert_liger_kernel_to_gemma3_text +from test.utils import revert_liger_kernel_to_glm4 +from test.utils import revert_liger_kernel_to_glm4v +from test.utils import revert_liger_kernel_to_glm4v_moe +from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mistral +from test.utils import revert_liger_kernel_to_mixtral +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_olmo2 +from test.utils import revert_liger_kernel_to_olmo3 +from test.utils import revert_liger_kernel_to_phi3 +from test.utils import revert_liger_kernel_to_qwen2 +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3 +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_moe +from test.utils import revert_liger_kernel_to_qwen3_next +from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smollm3 +from test.utils import set_seed +from test.utils import simple_collate_fn +from test.utils import supports_bfloat16 + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +try: + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM + + LLAMA4_AVAILABLE = True +except ImportError: + LLAMA4_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.modeling_mllama import MllamaForCausalLM + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + # Qwen2-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl 
import Qwen2_5_VLForConditionalGeneration + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_5_VL_AVAILABLE = False + +try: + from transformers.models.qwen3.configuration_qwen3 import Qwen3Config + from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM + from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig + from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeForCausalLM + + QWEN3_AVAILABLE = True +except ImportError: + QWEN3_AVAILABLE = False + +try: + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + + QWEN3_VL_AVAILABLE = True +except ImportError: + QWEN3_VL_AVAILABLE = False + +try: + import transformers + + from packaging import version + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + +try: + from transformers.models.granite import GraniteConfig + from transformers.models.granite import GraniteForCausalLM + + GRANITE_AVAILABLE = True +except ImportError: + GRANITE_AVAILABLE = False + +try: + from transformers import CLIPVisionConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + # OLMO2 is only available in transformers>=4.47.0 + from transformers.models.olmo2.configuration_olmo2 import Olmo2Config + from transformers.models.olmo2.modeling_olmo2 import Olmo2ForCausalLM + + OLMO2_AVAILABLE = True +except ImportError: + OLMO2_AVAILABLE = False + +try: + # OLMO3 is only available in transformers>=4.57.0 + from transformers.models.olmo3.configuration_olmo3 import Olmo3Config + from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM + + OLMO3_AVAILABLE = True +except ImportError: + OLMO3_AVAILABLE = False + +try: + # Glm4 is only available in transformers>=4.51.3 + from transformers.models.glm4.configuration_glm4 import Glm4Config + from transformers.models.glm4.modeling_glm4 import Glm4ForCausalLM + + GLM4_AVAILABLE = True +except ImportError: + GLM4_AVAILABLE = False + +try: + # Glm4v is only available in transformers>=4.51.3 + from transformers.models.glm4v.configuration_glm4v import Glm4vConfig + from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration + + GLM4V_AVAILABLE = True +except ImportError: + GLM4V_AVAILABLE = False + +try: + # Glm4v_moe is only available in transformers>=4.53.1 + from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig + from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeForConditionalGeneration + + GLM4V_MOE_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.53.1") +except ImportError: + GLM4V_MOE_AVAILABLE = False + +try: + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from
transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + # Smollm3 is only available in transformers>=4.53.0 + from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + from transformers.models.smollm3.modeling_smollm3 import SmolLM3ForCausalLM + + SMOLLM3_AVAILABLE = True +except ImportError: + SMOLLM3_AVAILABLE = False + +try: + # InternVL is only available in transformers>=4.52.1 + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + + INTERNVL_AVAILABLE = True +except ImportError: + INTERNVL_AVAILABLE = False + +try: + # FalconH1 is only available in transformers>=4.53.0 + from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config + from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM + + FALCONH1_AVAILABLE = True +except ImportError: + FALCONH1_AVAILABLE = False + +try: + # Qwen3Next is only available in transformers>=4.57.0 + from transformers.models.qwen3_next.configuration_qwen3_next import Qwen3NextConfig + from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM + + QWEN3NEXT_AVAILABLE = True +except ImportError: + QWEN3NEXT_AVAILABLE = False + +try: + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + +try: + from transformers.models.exaone4.configuration_exaone4 import Exaone4Config + from transformers.models.exaone4.modeling_exaone4 import Exaone4ForCausalLM + + EXAONE4_AVAILABLE = True +except ImportError: + EXAONE4_AVAILABLE = False + + +device = infer_device() + +MINI_MODEL_SETUPS = { + "mini_llama3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama, + model_class=LlamaForCausalLM, + mini_model_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_qwen2": MiniModelConfig( + 
liger_kernel_patch_func=apply_liger_kernel_to_qwen2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2, + model_class=Qwen2ForCausalLM, + mini_model_config=Qwen2Config( + attention_dropout=0.0, + bos_token_id=1, # 151643 + eos_token_id=2, # 151643 + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, # 131072 + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, # 151936 + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_phi3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_phi3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_phi3, + model_class=Phi3ForCausalLM, + mini_model_config=Phi3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, # 32000 + hidden_act="silu", + hidden_size=896, # 3072 + initializer_range=0.02, + intermediate_size=4864, # 8192 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=None, # defaults to num_attention_heads + rms_norm_eps=1e-5, + sliding_window=None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32064, + attn_implementation="eager", + ), + ), + "mini_mistral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mistral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mistral, + model_class=MistralForCausalLM, + mini_model_config=MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_mixtral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mixtral, + model_class=MixtralForCausalLM, + mini_model_config=MixtralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=512, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=32768, # 32768 + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_gemma1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + # gemma1 model config uses `hidden_act` and points it to gelu, + # https://huggingface.co/google/gemma-7b/blob/main/config.json#L10 + # but in reality it's ignored and HuggingFace will use the tanh approximation: + #
https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175 + hidden_act="gelu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma1.1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma2, + model_class=Gemma2ForCausalLM, + mini_model_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ), +} + +if LLAMA4_AVAILABLE: + MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4, + model_class=Llama4ForCausalLM, + mini_model_config=Llama4TextConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=1.0, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if QWEN3_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3, + model_class=Qwen3ForCausalLM, + mini_model_config=Qwen3Config( + attention_dropout=0.0, + bos_token_id=1, + 
eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_qwen3_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_moe, + model_class=Qwen3MoeForCausalLM, + mini_model_config=Qwen3MoeConfig( + vocab_size=32000, # 151936 + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + ), + ) + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + tie_word_embeddings=False, + image_token_id=31997, + video_token_id=31998, + vision_start_token_id=31995, + vision_end_token_id=31996, + text_config=dict( + attention_dropout=0.0, + attn_implementation="sdpa", + bos_token_id=1, + eos_token_id=2, + head_dim=112, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pad_token_id=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + vision_config=dict( + depth=4, + hidden_size=128, + initializer_range=0.02, + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=896, + num_position_embeddings=576, + deepstack_visual_indexes=[1, 2, 3], + ), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + tie_word_embeddings=False, + image_token_id=31997, + video_token_id=31998, + vision_start_token_id=31995, + vision_end_token_id=31996, + text_config=Qwen3VLMoeTextConfig( + attention_dropout=0.0, + attention_bias=False, + attn_implementation="sdpa", + bos_token_id=1, + eos_token_id=2, + head_dim=112, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pad_token_id=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + 
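# MoE settings below: decoder_sparse_step=1 and mlp_only_layers=[] make every decoder layer sparse, with 4 experts and top-2 routing +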
decoder_sparse_step=1, + moe_intermediate_size=3072, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + ).to_dict(), + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=128, + initializer_range=0.02, + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=896, + num_position_embeddings=576, + deepstack_visual_indexes=[1, 2, 3], + ).to_dict(), + ), + ) + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3_text"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma3_text, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3_text, + model_class=Gemma3ForCausalLM, + mini_model_config=Gemma3TextConfig( + vocab_size=32000, # 262144 + hidden_size=1024, # 1152 + intermediate_size=2048, # 6912 + num_hidden_layers=4, # 26 + num_attention_heads=4, + num_key_value_heads=1, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, # 32768 + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ) + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mllama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForCausalLM, + mini_model_config=MllamaTextConfig( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + rope_theta=500_000, + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + attention_dropout=0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 151643 + eos_token_id=2, # 151645 + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + vision_token_id=32767, # vocab_size - 3 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + hidden_act="silu", + hidden_size=1536, # 8192 + initializer_range=0.02, + intermediate_size=4864, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=12, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=False, + 
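# Sanity check on mrope_section above: 16 + 24 + 24 = 64 = head_dim // 2 (head_dim = hidden_size / num_attention_heads = 1536 / 12 = 128) +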
use_cache=True, + vocab_size=32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "embed_dim": 1280, + "mlp_ratio": 4, + "num_heads": 16, + "in_chans": 3, + "hidden_size": 128, # 1536 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + }, + attn_implementation="sdpa", + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_5_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + attention_dropout=0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 151643 + eos_token_id=2, # 151645 + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + vision_token_id=32767, # vocab_size - 3 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + hidden_act="silu", + hidden_size=1536, # 8192 + initializer_range=0.02, + intermediate_size=4864, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=12, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "window_size": 112, + "fullatt_block_indexes": [7, 15, 23, 31], + "tokens_per_second": 2, + "temporal_patch_size": 2, + }, + attn_implementation="sdpa", + ), + ) + +if GRANITE_AVAILABLE: + MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_granite, + liger_kernel_patch_revert_func=revert_liger_kernel_to_granite, + model_class=GraniteForCausalLM, + mini_model_config=GraniteConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + logits_scaling=8.0, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + 
liger_kernel_patch_func=apply_liger_kernel_to_llava, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO2_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo2, + model_class=Olmo2ForCausalLM, + mini_model_config=Olmo2Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO3_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3, + model_class=Olmo3ForCausalLM, + mini_model_config=Olmo3Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4, + model_class=Glm4ForCausalLM, + mini_model_config=Glm4Config( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=0.5, + 
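# partial_rotary_factor=0.5: rotary position embeddings cover only half of each attention head's dimensions +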
cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) +if GLM4V_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v, + model_class=Glm4vForConditionalGeneration, + mini_model_config=Glm4vConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + "pad_token_id": None, + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if GLM4V_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v_moe, + model_class=Glm4vMoeForConditionalGeneration, + mini_model_config=Glm4vMoeConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, 
+ "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + "attention_dropout": 0.0, + "moe_intermediate_size": 1408, + "num_experts_per_tok": 2, + "n_shared_experts": 1, + "n_routed_experts": 8, + "routed_scaling_factor": 1.0, + "n_group": 1, + "topk_group": 1, + "first_k_dense_replace": 1, + "norm_topk_prob": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if SMOLLM3_AVAILABLE: + MINI_MODEL_SETUPS["mini_smollm3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smollm3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_smollm3, + model_class=SmolLM3ForCausalLM, + mini_model_config=SmolLM3Config( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, # 128000 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + num_key_value_heads=2, # 16 + max_position_embeddings=4096, # 8192 + vocab_size=32000, # 151936 + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + tie_word_embeddings=False, + ), + vision_config={ + "hidden_size": 256, # 1024 + "intermediate_size": 1024, # 4096 + "num_hidden_layers": 4, # 24 + "num_attention_heads": 4, # 16 + }, + image_token_id=10, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if FALCONH1_AVAILABLE: + MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1, + liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1, + model_class=FalconH1ForCausalLM, + mini_model_config=FalconH1Config( + model_type="falcon_h1", + vocab_size=32000, + hidden_size=256, # 4096 + num_hidden_layers=4, # 24 + num_attention_heads=4, # 32 + num_key_value_heads=2, # 8 + intermediate_size=1024, # 11008 + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + mamba_d_ssm=128, # 1024 + mamba_n_heads=16, # 128 + mamba_d_state=32, # 245 + mamba_d_conv=2, # 4 + ), + ) + +if QWEN3NEXT_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_next"] = MiniModelConfig( + 
liger_kernel_patch_func=apply_liger_kernel_to_qwen3_next, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_next, + model_class=Qwen3NextForCausalLM, + mini_model_config=Qwen3NextConfig( # Copied from Qwen3MoeConfig + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + # config.dtype must be set if fla is installed, since there's a bug in the original code (there is no torch.get_current_dtype()) + # https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L613 + dtype=torch.bfloat16, + ), + ) + +if QWEN3_5_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForCausalLM, + mini_model_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + head_dim=128, + linear_conv_kernel_dim=4, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=8, + linear_num_value_heads=8, + layer_types=["linear_attention", "linear_attention", "linear_attention", "full_attention"], + dtype=torch.bfloat16, + ), + ) + + +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config=HunYuanMoEV1Config( + vocab_size=32000, + hidden_size=128, + intermediate_size=512, + head_dim=16, + num_hidden_layers=2, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + num_experts=2, + moe_topk=1, + attn_implementation="sdpa", + ), + ) + +if EXAONE4_AVAILABLE: +
MINI_MODEL_SETUPS["mini_exaone4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_exaone4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_exaone4, + model_class=Exaone4ForCausalLM, + mini_model_config=Exaone4Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + pad_token_id=None, + ), + ) + + +def create_model(model_name="mini_llama3"): + """ + Create a mini version model + The commented values are the original values + """ + model_config = MINI_MODEL_SETUPS[model_name].mini_model_config + model_class = MINI_MODEL_SETUPS[model_name].model_class + return model_class(model_config) + + +@require_deterministic +def run_mini_model( + model_name="mini_llama3", + num_steps=100, + dtype=torch.bfloat16, + lr=1e-5, + with_liger=False, +): + # If we move it to the beginning of test_mini_model, the two runs are initialized with different weights. + # This is due to RNG (Random Number Generator). The formula of RNG progression is x_(n+1) = (a * x_n + c) % m + # Everytime RNG is used, like randomly initialzing weight, the RNG progresses to the next state. + # Therefore, we have to reset RNG before we create the model to ensure the weight initialization started from the same RNG state. + + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + if "mllama" in model_name: + revert_kwargs["model_type"] = "causal_lm" + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + } + + if "glm4" in model_name or "llama4" in model_name or "qwen3_next" in model_name or "qwen3_5" in model_name: + kwargs["rope"] = False + + model_supports_layer_norm = "qwen2_vl" in model_name + if model_supports_layer_norm: + kwargs["layer_norm"] = True + + if "gemma" in model_name: + kwargs["geglu"] = True + else: + kwargs["swiglu"] = True + + if "llava" in model_name: + apply_liger_kernel_to_llama(**kwargs) + + kwargs["fused_linear_cross_entropy"] = False + kwargs["cross_entropy"] = False + + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + + train_dataset = load_from_disk(DEFAULT_DATASET_PATH) + loader = DataLoader(train_dataset, batch_size=16, shuffle=False, collate_fn=simple_collate_fn) + loader_iter = iter(loader) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + batch = next(loader_iter).to(model.device) + optimizer.zero_grad() + output = model(**batch) + output.loss.backward() + optimizer.step() + print(f"Step {i}, Loss: {output.loss.item()}") + loss_list.append(output.loss.item()) + + topk_logprobs = get_topk(get_logprobs(output.logits)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + # Tolerance is set higher than usual to pass the tests. 
pytest.param( + "mini_llama4", # llama4 requires slightly larger tolerances to pass this test after a bug fix to llama4 in transformers v5.0.0 + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 4e-1, + 3e-1, + 2e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not LLAMA4_AVAILABLE, + reason="Llama4 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_llama3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_llava", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_granite3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, # loss atol + 1e-2, # loss rtol + 1e-1, # logprobs atol + 1e-2, # logprobs rtol + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GRANITE_AVAILABLE, + reason="Granite not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_mllama", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen2", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_qwen3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 2e-1, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + pytest.mark.skipif(True, reason="Flaky test"), + ], + ), + pytest.param( + "mini_qwen2_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not
supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen2_5_vl", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_phi3", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_mistral", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + # TODO: mixtral is flaky, so the test is disabled for now + # pytest.param( + # "mini_mixtral", + # 32, + # 1e-4, + # torch.bfloat16, + # 1e-3, + # 1e-2, + # 1e-1, + # 1e-2, + # 1e-1, + # 1e-2, + # marks=pytest.mark.skipif( + # not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + # ), + # ), + # Gemma 1.1 and 2 have higher tolerance because the kernel is currently not a perfect match + pytest.param( + "mini_gemma1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_gemma1.1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + ), + pytest.param( + "mini_olmo2", + 32, + 1e-4, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not OLMO2_AVAILABLE, + reason="OLMO2 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_olmo3", + 32, + 1e-4, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not OLMO3_AVAILABLE, + reason="OLMO3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4_AVAILABLE, + reason="Glm4 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4v", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 2e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4V_AVAILABLE, + reason="Glm4v not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_glm4v_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 4e-1, # rms_norm patch needs higher tolerance in bf16 + 1e-1, + 5e-1, # rms_norm patch needs higher tolerance in bf16 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GLM4V_MOE_AVAILABLE, + reason="Glm4v_moe not
available in this version of transformers", + ), + ], + ), + # TODO: Gemma2 test for bf16 is not passing within the tolerance range, might be casting issue, need to investigate + # pytest.param( + # "mini_gemma2", + # 32, + # 1e-4, + # torch.bfloat16, + # 1e-3, + # 1e-2, + # 1e-1, + # 1e-2, + # 1e-2, + # 1e-2, + # marks=pytest.mark.skipif( + # not supports_bfloat16(), reason="bfloat16 not supported on this GPU" + # ), + # ), + pytest.param( + "mini_gemma3_text", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 3e-1, # 1e-1 too flaky + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_smollm3", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not SMOLLM3_AVAILABLE, + reason="Smollm3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_internvl", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_falcon_h1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not FALCONH1_AVAILABLE, + reason="FalconH1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_next", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 1e-2, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3NEXT_AVAILABLE, + reason="Qwen3Next not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 2e-1, + 1e-1, + 1e-1, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3_5 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_exaone4", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not EXAONE4_AVAILABLE, + reason="EXAONE4 not available in this version of 
transformers", + ), + ], + ), + ], +) +def test_mini_model( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + + expected_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True) + + # Compare every step of the loss + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + # Iterate over the model's parameters and compare them + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) diff --git a/test/convergence/fp32/__init__.py b/test/convergence/fp32/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py new file mode 100755 index 0000000000000000000000000000000000000000..1a09cba21352152470c64d8237e6007d4e8c64b9 --- /dev/null +++ b/test/convergence/fp32/test_mini_models.py @@ -0,0 +1,2170 @@ +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS + +import pytest +import torch +import transformers + +from datasets import load_from_disk +from packaging import version +from torch.utils.data import DataLoader +from transformers.models.gemma import GemmaConfig +from transformers.models.gemma import GemmaForCausalLM +from transformers.models.gemma2 import Gemma2Config +from transformers.models.gemma2 import Gemma2ForCausalLM +from transformers.models.llama import LlamaConfig +from transformers.models.llama import LlamaForCausalLM +from transformers.models.mistral import MistralConfig +from transformers.models.mistral import MistralForCausalLM +from transformers.models.mixtral import MixtralConfig +from transformers.models.mixtral import MixtralForCausalLM +from transformers.models.phi3 import Phi3Config +from transformers.models.phi3 import Phi3ForCausalLM +from transformers.models.qwen2 import Qwen2Config +from transformers.models.qwen2 import Qwen2ForCausalLM + +from liger_kernel.transformers import apply_liger_kernel_to_exaone4 +from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1 +from liger_kernel.transformers import apply_liger_kernel_to_gemma +from liger_kernel.transformers import apply_liger_kernel_to_gemma2 +from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text +from liger_kernel.transformers import apply_liger_kernel_to_glm4 +from liger_kernel.transformers import apply_liger_kernel_to_glm4v +from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe +from liger_kernel.transformers import apply_liger_kernel_to_gpt_oss +from liger_kernel.transformers import apply_liger_kernel_to_granite +from 
liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mistral +from liger_kernel.transformers import apply_liger_kernel_to_mixtral +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from liger_kernel.transformers import apply_liger_kernel_to_olmo2 +from liger_kernel.transformers import apply_liger_kernel_to_olmo3 +from liger_kernel.transformers import apply_liger_kernel_to_phi3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_next +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smollm3 +from liger_kernel.utils import infer_device +from test.utils import DEFAULT_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_exaone4 +from test.utils import revert_liger_kernel_to_falcon_h1 +from test.utils import revert_liger_kernel_to_gemma +from test.utils import revert_liger_kernel_to_gemma2 +from test.utils import revert_liger_kernel_to_gemma3_text +from test.utils import revert_liger_kernel_to_glm4 +from test.utils import revert_liger_kernel_to_glm4v +from test.utils import revert_liger_kernel_to_glm4v_moe +from test.utils import revert_liger_kernel_to_gpt_oss +from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mistral +from test.utils import revert_liger_kernel_to_mixtral +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_olmo2 +from test.utils import revert_liger_kernel_to_olmo3 +from test.utils import revert_liger_kernel_to_phi3 +from test.utils import revert_liger_kernel_to_qwen2 +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3 +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_5_moe +from test.utils import revert_liger_kernel_to_qwen3_moe +from test.utils import revert_liger_kernel_to_qwen3_next 
+from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smollm3 +from test.utils import set_seed +from test.utils import simple_collate_fn + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +try: + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM + + LLAMA4_AVAILABLE = True +except ImportError: + LLAMA4_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.modeling_mllama import MllamaForCausalLM + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + # Qwen2-VL is only available in transformers>=4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>=4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_5_VL_AVAILABLE = False + + +try: + # Qwen3-VL is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + + QWEN3_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_AVAILABLE = False + + +try: + # Qwen3-VL-MoE is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + +try: + from transformers.models.granite import GraniteConfig + from transformers.models.granite import GraniteForCausalLM + + GRANITE_AVAILABLE = True +except ImportError: + GRANITE_AVAILABLE = False + +try: + from transformers import CLIPVisionConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + # OLMO2 is only available in transformers>=4.47.0 + from transformers.models.olmo2.configuration_olmo2 import Olmo2Config + from transformers.models.olmo2.modeling_olmo2 import Olmo2ForCausalLM + +
OLMO2_AVAILABLE = True +except ImportError: + OLMO2_AVAILABLE = False + +try: + # OLMO3 is only available in transformers>=4.57.0 + from transformers.models.olmo3.configuration_olmo3 import Olmo3Config + from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM + + OLMO3_AVAILABLE = True +except ImportError: + OLMO3_AVAILABLE = False + +try: + # Glm4 is only available in transformers>=4.51.3 + from transformers.models.glm4.configuration_glm4 import Glm4Config + from transformers.models.glm4.modeling_glm4 import Glm4ForCausalLM + + GLM4_AVAILABLE = True +except ImportError: + GLM4_AVAILABLE = False + +try: + # Glm4v is only available in transformers>=4.51.3 + from transformers.models.glm4v.configuration_glm4v import Glm4vConfig + from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration + + GLM4V_AVAILABLE = True +except ImportError: + GLM4V_AVAILABLE = False + +try: + # Glm4v_moe is only available in transformers>=4.51.3 + from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig + from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeForConditionalGeneration + + GLM4V_MOE_AVAILABLE = True +except ImportError: + GLM4V_MOE_AVAILABLE = False + +try: + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + # Smollm3 is only available in transformers>=4.53.0 + from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + from transformers.models.smollm3.modeling_smollm3 import SmolLM3ForCausalLM + + SMOLLM3_AVAILABLE = True +except ImportError: + SMOLLM3_AVAILABLE = False + +try: + from transformers.models.qwen3.configuration_qwen3 import Qwen3Config + from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM + from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig + from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeForCausalLM + + QWEN3_AVAILABLE = True +except ImportError: + QWEN3_AVAILABLE = False + +try: + # GPT-OSS is only available in transformers>=4.55.0 + from transformers.models.gpt_oss.configuration_gpt_oss import GptOssConfig + from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM + + GPT_OSS_AVAILABLE = True +except ImportError: + GPT_OSS_AVAILABLE = False + +try: + # InternVL is only available in transformers>=4.52.1 + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + + INTERNVL_AVAILABLE = True +except ImportError: + INTERNVL_AVAILABLE = False + +try: + # FalconH1 is only available in transformers>=4.53.0 + from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config + from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM + + FALCONH1_AVAILABLE = True +except ImportError: + FALCONH1_AVAILABLE = False + +try: + # Qwen3Next is only available in transformers>=4.57.0 + from transformers.models.qwen3_next.configuration_qwen3_next import Qwen3NextConfig + from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM + + QWEN3NEXT_AVAILABLE = True +except ImportError: + QWEN3NEXT_AVAILABLE = False + +try: + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeForCausalLM + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe 
import Qwen3_5MoeTextConfig + + QWEN3_5_MOE_AVAILABLE = True +except ImportError: + QWEN3_5_MOE_AVAILABLE = False + +try: + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + +try: + from transformers.models.exaone4.configuration_exaone4 import Exaone4Config + from transformers.models.exaone4.modeling_exaone4 import Exaone4ForCausalLM + + EXAONE4_AVAILABLE = True +except ImportError: + EXAONE4_AVAILABLE = False + + +device = infer_device() + +MINI_MODEL_SETUPS = { + "mini_llama3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama, + model_class=LlamaForCausalLM, + mini_model_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_qwen2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2, + model_class=Qwen2ForCausalLM, + mini_model_config=Qwen2Config( + attention_dropout=0.0, + bos_token_id=1, # 151643 + eos_token_id=2, # 151643 + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, # 131072 + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, # 151936 + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_phi3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_phi3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_phi3, + model_class=Phi3ForCausalLM, + mini_model_config=Phi3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, # 32000 + hidden_act="silu", + hidden_size=896, # 3072 + initializer_range=0.02, + intermediate_size=4864, # 8192 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + 
num_hidden_layers=4, # 32 + num_key_value_heads=None, # defaults to num_attention_heads + rms_norm_eps=1e-5, + sliding_window=None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32064, + attn_implementation="eager", + ), + ), + "mini_mistral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mistral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mistral, + model_class=MistralForCausalLM, + mini_model_config=MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_mixtral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mixtral, + model_class=MixtralForCausalLM, + mini_model_config=MixtralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=512, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=32768, # 32768 + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_gemma1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + # gemma1 model config uses `hidden_act` and points it to gelu, + # https://huggingface.co/google/gemma-7b/blob/main/config.json#L10 + # but in reality it is ignored and HuggingFace uses the tanh approximation: + # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175 + hidden_act="gelu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma1.1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 +
tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma2, + model_class=Gemma2ForCausalLM, + mini_model_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ), +} +if LLAMA4_AVAILABLE: + MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4, + model_class=Llama4ForCausalLM, + mini_model_config=Llama4TextConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=1.0, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + + +if QWEN3_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3, + model_class=Qwen3ForCausalLM, + mini_model_config=Qwen3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_qwen3_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_moe, + model_class=Qwen3MoeForCausalLM, + mini_model_config=Qwen3MoeConfig( + vocab_size=32000, # 151936 + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + ), + ) + +if GPT_OSS_AVAILABLE: + MINI_MODEL_SETUPS["mini_gpt_oss"] = MiniModelConfig( + 
liger_kernel_patch_func=apply_liger_kernel_to_gpt_oss, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gpt_oss, + model_class=GptOssForCausalLM, + mini_model_config=GptOssConfig( + vocab_size=32000, # 201088 + hidden_size=896, + intermediate_size=896, # Same as hidden_size for GPT-OSS + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + attention_dropout=0.0, + num_local_experts=8, # Reduced from 32 for mini model + num_experts_per_tok=2, # Reduced from 4 for mini model + router_aux_loss_coef=0.9, + output_router_logits=False, + sliding_window=128, + layer_types=["sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(4)], + ), + ) + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3_text"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma3_text, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3_text, + model_class=Gemma3ForCausalLM, + mini_model_config=Gemma3TextConfig( + vocab_size=32000, # 262144 + hidden_size=1024, # 1152 + intermediate_size=2048, # 6912 + num_hidden_layers=4, # 26 + num_attention_heads=4, + num_key_value_heads=1, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, # 32768 + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ) + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mllama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForCausalLM, + mini_model_config=MllamaTextConfig( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + rope_theta=500_000, + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + # In transformers v5, text-related parameters must be in text_config + text_config={ + "attention_dropout": 0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + "bos_token_id": 1, # 151643 + "eos_token_id": 2, # 151645 + "hidden_act": "silu", + "hidden_size": 1536, # 8192 + "initializer_range": 0.02, + "intermediate_size": 4864, # 29568 + "max_position_embeddings": 32768, + "max_window_layers": 4, # 80 + "num_attention_heads": 12, # 64 + "num_hidden_layers": 4, # 80 + "num_key_value_heads": 
2, # 8 + "rms_norm_eps": 1e-6, # 1e-5 + **( + {"rope_parameters": {"mrope_section": [16, 24, 24]}} # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else {"rope_scaling": {"type": "mrope", "mrope_section": [16, 24, 24]}} + ), + "sliding_window": 4096, + "tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + "use_sliding_window": False, + }, + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + vision_config={ + "depth": 4, # 32 + "embed_dim": 1280, + "mlp_ratio": 4, + "num_heads": 16, + "in_chans": 3, + "hidden_size": 128, # 1536 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + }, + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_5_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + # In transformers v5, text-related parameters must be in text_config + text_config={ + "attention_dropout": 0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + "bos_token_id": 1, # 151643 + "eos_token_id": 2, # 151645 + "hidden_act": "silu", + "hidden_size": 1536, # 8192 + "initializer_range": 0.02, + "intermediate_size": 4864, # 29568 + "max_position_embeddings": 32768, + "max_window_layers": 4, # 80 + "num_attention_heads": 12, # 64 + "num_hidden_layers": 4, # 80 + "num_key_value_heads": 2, # 8 + "rms_norm_eps": 1e-6, # 1e-5 + **( + {"rope_parameters": {"mrope_section": [16, 24, 24]}} # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else {"rope_scaling": {"type": "mrope", "mrope_section": [16, 24, 24]}} + ), + "sliding_window": 4096, + "tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + "use_sliding_window": False, + }, + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "window_size": 112, + "fullatt_block_indexes": [7, 15, 23, 31], + "tokens_per_second": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + bos_token_id=1, + eos_token_id=2, + vision_start_token_id=32765, + vision_end_token_id=32766, + image_token_id=32768, + video_token_id=32769, + tie_word_embeddings=False, + attn_implementation="sdpa", + text_config=dict( + attention_dropout=0.0, + hidden_act="silu", + hidden_size=1536, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=12, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + use_cache=True, 
+ vocab_size=32768, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + vision_config=dict( + depth=4, + hidden_size=128, + hidden_act="silu", + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=128, + num_position_embeddings=256, + deepstack_visual_indexes=[], + initializer_range=0.02, + ), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + bos_token_id=1, + eos_token_id=2, + vision_start_token_id=32765, + vision_end_token_id=32766, + image_token_id=32768, + video_token_id=32769, + tie_word_embeddings=False, + attn_implementation="sdpa", + text_config=Qwen3VLMoeTextConfig( + attention_dropout=0.0, + attention_bias=False, + hidden_act="silu", + hidden_size=1536, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=12, + num_hidden_layers=4, + num_key_value_heads=2, + head_dim=128, + rms_norm_eps=1e-6, + use_cache=True, + vocab_size=32768, + decoder_sparse_step=1, + moe_intermediate_size=3072, + num_experts_per_tok=2, + num_experts=4, + tie_word_embeddings=False, + mlp_only_layers=[], + pad_token_id=None, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ).to_dict(), + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=128, + hidden_act="gelu_pytorch_tanh", + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=128, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + ), + ) + +if GRANITE_AVAILABLE: + MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_granite, + liger_kernel_patch_revert_func=revert_liger_kernel_to_granite, + model_class=GraniteForCausalLM, + mini_model_config=GraniteConfig( + attention_bias=False, + attention_dropout=0.1, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO2_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo2, + model_class=Olmo2ForCausalLM, + mini_model_config=Olmo2Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + 
cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO3_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3, + model_class=Olmo3ForCausalLM, + mini_model_config=Olmo3Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4, + model_class=Glm4ForCausalLM, + mini_model_config=Glm4Config( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=32768, + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4V_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v, + model_class=Glm4vForConditionalGeneration, + mini_model_config=Glm4vConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + 
"pad_token_id": None, + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if GLM4V_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v_moe, + model_class=Glm4vMoeForConditionalGeneration, + mini_model_config=Glm4vMoeConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + "attention_dropout": 0.0, + "moe_intermediate_size": 1408, + "num_experts_per_tok": 2, + "n_shared_experts": 1, + "n_routed_experts": 8, + "routed_scaling_factor": 1.0, + "n_group": 1, + "topk_group": 1, + "first_k_dense_replace": 1, + "norm_topk_prob": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llava, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 
16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if SMOLLM3_AVAILABLE: + MINI_MODEL_SETUPS["mini_smollm3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smollm3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_smollm3, + model_class=SmolLM3ForCausalLM, + mini_model_config=SmolLM3Config( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, # 128000 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + num_key_value_heads=2, # 16 + max_position_embeddings=4096, # 8192 + vocab_size=32000, # 151936 + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + tie_word_embeddings=False, + ), + vision_config={ + "hidden_size": 256, # 1024 + "intermediate_size": 1024, # 4096 + "num_hidden_layers": 4, # 24 + "num_attention_heads": 4, # 16 + }, + image_token_id=10, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if FALCONH1_AVAILABLE: + MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1, + liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1, + model_class=FalconH1ForCausalLM, + mini_model_config=FalconH1Config( + model_type="falcon_h1", + vocab_size=32000, + hidden_size=256, # 4096 + num_hidden_layers=4, # 24 + num_attention_heads=4, # 32 + num_key_value_heads=2, # 8 + intermediate_size=1024, # 11008 + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + mamba_d_ssm=128, # 1024 + mamba_n_heads=16, # 128 + mamba_d_state=32, # 245 + mamba_d_conv=2, # 4 + ), + ) + +if QWEN3NEXT_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_next"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_next, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_next, + model_class=Qwen3NextForCausalLM, + mini_model_config=Qwen3NextConfig( # Copypaste Qwen3MoeConfig + vocab_size=32000, + 
hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + # config.dtype must be set if fla (flash-linear-attention) is installed, since the original code references the non-existent torch.get_current_dtype() + dtype=torch.float32, + ), + ) + +if QWEN3_5_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5_moe, + model_class=Qwen3_5MoeForCausalLM, + mini_model_config=Qwen3_5MoeTextConfig( + vocab_size=32000, + hidden_size=896, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + head_dim=128, + linear_conv_kernel_dim=4, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=8, + linear_num_value_heads=8, + moe_intermediate_size=768, + shared_expert_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + output_router_logits=False, + router_aux_loss_coef=0.001, + # config.dtype must be set if fla (flash-linear-attention) is installed, since the original code references the non-existent torch.get_current_dtype() + dtype=torch.float32, + ), + ) + +if QWEN3_5_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForCausalLM, + mini_model_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + head_dim=128, + linear_conv_kernel_dim=4, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=8, + linear_num_value_heads=8, + layer_types=["linear_attention", "linear_attention", "linear_attention", "full_attention"], + dtype=torch.float32, + ), + ) + +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( +
liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config=HunYuanMoEV1Config( + hidden_act="silu", + attention_dropout=0.0, + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + num_experts=8, + moe_topk=2, + use_cache=True, + attn_implementation="sdpa", + ), + ) + +if EXAONE4_AVAILABLE: + MINI_MODEL_SETUPS["mini_exaone4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_exaone4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_exaone4, + model_class=Exaone4ForCausalLM, + mini_model_config=Exaone4Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + pad_token_id=None, + ), + ) + + +def create_model(model_name="mini_llama3"): + """ + Create a mini version of the model. + The commented values are the original values + """ + model_config = MINI_MODEL_SETUPS[model_name].mini_model_config + model_class = MINI_MODEL_SETUPS[model_name].model_class + return model_class(model_config) + + +@require_deterministic +def run_mini_model( + model_name="mini_llama3", + num_steps=100, + dtype=torch.float32, + lr=1e-5, + with_liger=False, +): + # If we moved the seeding to the beginning of test_mini_model, the two runs would be initialized with different weights. + # This is due to the RNG (Random Number Generator). The formula of RNG progression is x_(n+1) = (a * x_n + c) % m. + # Every time the RNG is used, e.g. when randomly initializing a weight, it progresses to the next state. + # Therefore, we have to reset the RNG before we create each model to ensure that weight initialization starts from the same RNG state.
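 + # A minimal illustration of the point above (a hypothetical snippet, not part of this test): after torch.manual_seed(42), + # two consecutive torch.randn(1) calls return different tensors, since each call advances the global RNG state; + # re-seeding with torch.manual_seed(42) between the two calls makes the second call reproduce the first.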
+ + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + if "mllama" in model_name: + revert_kwargs["model_type"] = "causal_lm" + + if with_liger: + kwargs = { + "rope": True, + "rms_norm": True, + } + + if "glm4" in model_name or "qwen3_next" in model_name or "qwen3_5" in model_name: + kwargs["rope"] = False + + model_supports_layer_norm = "qwen2_vl" in model_name + if model_supports_layer_norm: + kwargs["layer_norm"] = True + + if "gemma" in model_name: + kwargs["geglu"] = True + else: + kwargs["swiglu"] = True + + if "llava" in model_name: + apply_liger_kernel_to_llama(**kwargs) + + # fused_linear_cross_entropy is not supported in mini_granite3 + kwargs["fused_linear_cross_entropy"] = model_name != "mini_granite3" + kwargs["cross_entropy"] = False + + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + + train_dataset = load_from_disk(DEFAULT_DATASET_PATH) + loader = DataLoader(train_dataset, batch_size=16, shuffle=False, collate_fn=simple_collate_fn) + loader_iter = iter(loader) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + for i in range(num_steps): + batch = next(loader_iter).to(model.device) + optimizer.zero_grad() + output = model(**batch) + output.loss.backward() + optimizer.step() + print(f"Step {i}, Loss: {output.loss.item()}") + loss_list.append(output.loss.item()) + + model.eval() + eval_batch = next(loader_iter).to(model.device) + if with_liger: + eval_batch["skip_logits"] = False + with torch.no_grad(): + eval_output = model(**eval_batch) + print(f"Eval Loss: {eval_output.loss.item()}") + loss_list.append(eval_output.loss.item()) + topk_logprobs = get_topk(get_logprobs(eval_output.logits)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_llama4", # llama4 requires slightly larger tolerances to pass this test after bug fix to llama4 in transformers v5.0.0 + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-3, + 5e-3, + 1e-3, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not LLAMA4_AVAILABLE, + reason="Llama4 not available in this version of transformers", + ), + # pytest.mark.xfail( + # reason=( + # "RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype:" + # " float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead."
+ # ) + # ), + ], + ), + ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_llava", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + pytest.mark.skipif( + version.parse(transformers.__version__) < version.parse("4.52.0"), + reason="LLaVa doesn't materialize logits in transformers<=4.52.0 so we can't test it", + ), + ], + ), + pytest.param( + "mini_mllama", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + ), + pytest.param( + "mini_gemma3_text", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-4, + 5e-2, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ), + ("mini_qwen2", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_qwen3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_moe", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_gpt_oss", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GPT_OSS_AVAILABLE, + reason="GPT-OSS not available in this version of transformers", + ), + ), + pytest.param( # qwen2_vl requires slightly larger tolerances to pass this test after bug fix to qwen2_vl in transformers v4.47.0 + "mini_qwen2_vl", + 32, + 1e-4, + torch.float32, + 1e-5, # 1e-8, + 1e-1, # 1e-5, + 5e-3, # 5e-3, + 1e-5, # 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + ), + # TODO: logits tolerances are significantly larger than the other tests, need to investigate + pytest.param( # qwen2_5_vl requires slightly larger tolerances to pass this test after bug fix to qwen2_vl in transformers v4.47.0 + "mini_qwen2_5_vl", + 32, + 1e-4, + torch.float32, + 1e-5, # 1e-8, + 1e-1, # 1e-5, + 5e-3, # 5e-3, + 1e-5, # 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-4, + torch.float32, + 1e-5, # 1e-8, + 1e-1, # 1e-5, + 5e-3, # 5e-3, + 1e-5, # 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-4, + torch.float32, + 1e-5, + 1e-1, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + ), + pytest.param( + "mini_olmo2", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not OLMO2_AVAILABLE, + reason="OLMO2 not available in this version of transformers", + ), + ), + pytest.param( + "mini_olmo3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 
5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not OLMO3_AVAILABLE, + reason="OLMO3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GLM4_AVAILABLE, + reason="Glm4 not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4v", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GLM4V_AVAILABLE, + reason="Glm4v not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4v_moe", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-3, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not GLM4V_MOE_AVAILABLE, + reason="Glm4v_moe not available in this version of transformers", + ), + ], + ), + ("mini_phi3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_mistral", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[], + ), + # TODO: mixtral is flaky so disable the test for now + # ("mini_mixtral", 32, 1e-4, torch.float32, 5e-4, 1e-4, 5e-3, 1e-5, 1e-2, 1e-5), + # Gemma 1.1 and 2 have larger tolerances because the kernel is currently not a perfect match (casts are not done the same way) + ("mini_gemma1", 32, 1e-5, torch.float32, 1e-8, 1e-4, 5e-2, 1e-5, 5e-3, 1e-5), + ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), + ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_granite3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-4, + 4e-2, # 4e-3 + 1e-5, # 1e-5 + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GRANITE_AVAILABLE, + reason="Granite not available in this version of transformers", + ), + ), + pytest.param( + "mini_smollm3", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif( + not SMOLLM3_AVAILABLE, + reason="Smollm3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_internvl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ), + pytest.param( + "mini_falcon_h1", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-4, + 4e-2, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not FALCONH1_AVAILABLE, + reason="FalconH1 not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_next", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3NEXT_AVAILABLE, + reason="Qwen3Next not available in this version of transformers", + ), + pytest.mark.skip( + reason="flash-linear-attention's ChunkGatedDeltaRuleFunction does not support float32.\n" + + " Torch's implementation takes too long" + ), + ], + ), + pytest.param( + "mini_qwen3_5_moe", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_5_MOE_AVAILABLE, + reason="Qwen3_5Moe not available in this version of transformers", + ), + pytest.mark.skip( + reason="flash-linear-attention's ChunkGatedDeltaRuleFunction does not support float32.\n" + + " Torch's implementation takes too long" + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ +
pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3_5 not available in this version of transformers", + ), + pytest.mark.skip( + reason="flash-linear-attention's ChunkGatedDeltaRuleFunction does not support float32.\n" + + " Torch's implementation takes too long" + ), + ], + ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ), + pytest.param( + "mini_exaone4", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not EXAONE4_AVAILABLE, + reason="EXAONE4 not available in this version of transformers", + ), + ), + ], +) +def test_mini_model( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + + expected_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True) + + # Compare every step of the loss + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + if expected_output["topk_logprobs"] is not None and actual_output["topk_logprobs"] is not None: + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + # Iterate over the model's parameters and compare them + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) diff --git a/test/convergence/fp32/test_mini_models_multimodal.py b/test/convergence/fp32/test_mini_models_multimodal.py new file mode 100755 index 0000000000000000000000000000000000000000..f3e59bc3a1ae8ee2353f657f8bf1e67c967d5df1 --- /dev/null +++ b/test/convergence/fp32/test_mini_models_multimodal.py @@ -0,0 +1,1934 @@ +import functools +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS + +import pytest +import torch +import transformers + +from datasets import load_dataset +from packaging import version +from torch.utils.data import DataLoader +from transformers import PreTrainedTokenizerFast +from transformers.models.siglip.configuration_siglip import SiglipVisionConfig + +from liger_kernel.transformers import apply_liger_kernel_to_gemma3 +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from 
liger_kernel.transformers import apply_liger_kernel_to_paligemma +from liger_kernel.transformers import apply_liger_kernel_to_pixtral +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smolvlm +from liger_kernel.utils import infer_device +from test.utils import FAKE_CONFIGS_PATH +from test.utils import UNTOKENIZED_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import is_torchvision_available +from test.utils import load_image_processing_config +from test.utils import load_processor_config +from test.utils import load_tokenizer_config +from test.utils import multimodal_collate_fn +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_gemma3 +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_Paligemma +from test.utils import revert_liger_kernel_to_pixtral +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smolvlm2 +from test.utils import set_seed +from test.utils import train_bpe_tokenizer + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.gemma.tokenization_gemma import GemmaTokenizer +else: + from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast as GemmaTokenizer + +try: + # Qwen2-VL is only available in transformers>=4.52.4 + import transformers + + from packaging import version + + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: + from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor + from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>4.48.2 + import transformers + + from packaging import version + + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: + from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from 
transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_5_VL_AVAILABLE = False + + +try: + if IS_TRANSFORMERS_V5_OR_LATER: + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + else: + from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as Qwen2Tokenizer + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor + from transformers.models.qwen3_vl.video_processing_qwen3_vl import Qwen3VLVideoProcessor + + QWEN3_VL_AVAILABLE = True +except ImportError: + QWEN3_VL_AVAILABLE = False + + +try: + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = True +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + +try: + from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor + from transformers.models.qwen3_5.configuration_qwen3_5 import 
Qwen3_5Config + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5VisionConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForConditionalGeneration + from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor + from transformers.models.qwen3_vl.video_processing_qwen3_vl import Qwen3VLVideoProcessor + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaConfig + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.configuration_mllama import MllamaVisionConfig + from transformers.models.mllama.image_processing_mllama import MllamaImageProcessor + from transformers.models.mllama.modeling_mllama import MllamaForConditionalGeneration + from transformers.models.mllama.processing_mllama import MllamaProcessor + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + from transformers import CLIPImageProcessor + from transformers import CLIPVisionConfig + from transformers import LlamaConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + from transformers.models.llava.processing_llava import LlavaProcessor + + from liger_kernel.transformers import apply_liger_kernel_to_llama + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + from transformers.models.llama4.configuration_llama4 import Llama4Config + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig + from transformers.models.llama4.image_processing_llama4_fast import Llama4ImageProcessorFast + from transformers.models.llama4.modeling_llama4 import Llama4ForConditionalGeneration + from transformers.models.llama4.processing_llama4 import Llama4Processor + + LLAMA4_AVAILABLE = True + +except ImportError: + LLAMA4_AVAILABLE = False + +try: + import transformers + + from packaging import version + from transformers.models.gemma.configuration_gemma import GemmaConfig + from transformers.models.gemma2.configuration_gemma2 import Gemma2Config + from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig + from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration + from transformers.models.paligemma.processing_paligemma import PaliGemmaProcessor + from transformers.models.siglip.image_processing_siglip import SiglipImageProcessor + + PALIGEMMA_AVAILABLE = True +except ImportError: + PALIGEMMA_AVAILABLE = False + +try: + # Gemma3 is only available in transformers>=4.50.0 + from transformers.models.gemma3.configuration_gemma3 import Gemma3Config + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from transformers.models.gemma3.image_processing_gemma3 import Gemma3ImageProcessor + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration + from transformers.models.gemma3.processing_gemma3 import Gemma3Processor + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + # InternVL is only available in transformers>=4.52.1 + from transformers.models.got_ocr2.image_processing_got_ocr2_fast import 
GotOcr2ImageProcessorFast + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + from transformers.models.internvl.processing_internvl import InternVLProcessor + from transformers.models.internvl.video_processing_internvl import InternVLVideoProcessor + from transformers.models.qwen2.configuration_qwen2 import Qwen2Config + + INTERNVL_AVAILABLE = True +except ImportError: + INTERNVL_AVAILABLE = False + +try: + # SmolVLM2 is only available in transformers>=4.50.0 + from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer + from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig + from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor + from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration + from transformers.models.smolvlm.processing_smolvlm import SmolVLMProcessor + from transformers.models.smolvlm.video_processing_smolvlm import SmolVLMVideoProcessor + + SMOLVLM2_AVAILABLE = True +except ImportError: + SMOLVLM2_AVAILABLE = False + +try: + from transformers.models.pixtral.configuration_pixtral import PixtralVisionConfig + from transformers.models.pixtral.modeling_pixtral import PixtralVisionModel + + PIXTRAL_AVAILABLE = True +except ImportError: + PIXTRAL_AVAILABLE = False + +try: + from num2words import num2words # noqa: F401 + + NUM2WORDS_AVAILABLE = True +except ImportError: + NUM2WORDS_AVAILABLE = False + + +device = infer_device() + +torch.use_deterministic_algorithms(True) + +# Only setting torch.use_deterministic_algorithms(True) throws the following error: +# RuntimeError: Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or `at::Context::setDeterministicAlgorithms(true)`, +# but this operation is not deterministic because it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this case, you must set an +# environment variable before running your PyTorch application: CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8. For more information, +# go to https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + +TEST_IMAGE_DIM = 64 + +MINI_MODEL_SETUPS = {} + +if LLAMA4_AVAILABLE: + MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_llama4, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4, + model_class=Llama4ForConditionalGeneration, + mini_model_config=Llama4Config( + image_token_index=8, + vision_config=Llama4VisionConfig( + attn_implementation_autoset=True, + attention_dropout=0.0, + hidden_act="gelu", + hidden_size=512, # 1280 + image_size=560, # 560 + initializer_range=0.02, + intermediate_layers_indices=[2], # [3, 7, 15, etc...] + intermediate_size=2048, # 5120 + max_num_tiles=1, # 4 + norm_eps=1e-5, + num_attention_heads=4, # 16 + num_channels=3, + num_global_layers=2, # 8 + num_hidden_layers=8, # 32 + patch_size=280, # 14 + supported_aspect_ratios=[[1, 1]], # [[1, 1], [1, 2], etc... ] + vision_output_dim=4096, # 7680 + ), + text_config=Llama4TextConfig( + bos_token_id=0, + eos_token_id=0, + pad_token_id=0, + cross_attention_layers=[2], # [3, 8, 13, 18, etc...] 
+ dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + ), + attn_implementation="sdpa", + ), + ) + + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_mllama, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForConditionalGeneration, + mini_model_config=MllamaConfig( + vision_config=MllamaVisionConfig( + hidden_act="gelu", + hidden_size=512, # 1280 + image_size=560, # 560 + initializer_range=0.02, + intermediate_layers_indices=[2], # [3, 7, 15, etc...] + intermediate_size=2048, # 5120 + max_num_tiles=1, # 4 + norm_eps=1e-5, + num_attention_heads=4, # 16 + num_channels=3, + num_global_layers=2, # 8 + num_hidden_layers=8, # 32 + patch_size=140, # 14 + supported_aspect_ratios=[[1, 1]], # [[1, 1], [1, 2], etc... ] + vision_output_dim=1024, # 7680 + ), + text_config=MllamaTextConfig( + bos_token_id=0, + eos_token_id=0, + pad_token_id=0, + cross_attention_layers=[2], # [3, 8, 13, 18, etc...] + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + ), + image_token_index=1, # NOTE: outside the vocab size + attn_implementation="sdpa", + ), + ) + +if PALIGEMMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_paligemma"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma, + model_class=PaliGemmaForConditionalGeneration, + mini_model_config=PaliGemmaConfig( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + projection_dim=1024, # 2304 + ), + text_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + image_token_index=4, # NOTE: outside the vocab size + attn_implementation="eager", + vocab_size=32000, + projection_dim=1024, + ), + ) + + 
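# PaliGemma 2 mini setup below: the SigLIP vision tower (SiglipVisionConfig) is identical to mini_paligemma above, but the text backbone switches from Gemma (GemmaConfig) to Gemma2 (Gemma2Config). + 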
MINI_MODEL_SETUPS["mini_paligemma2"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_paligemma, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_Paligemma, + model_class=PaliGemmaForConditionalGeneration, + mini_model_config=PaliGemmaConfig( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + projection_dim=1024, # 2304 + ), + text_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + image_token_index=4, # NOTE: outside the vocab size + attn_implementation="eager", + vocab_size=32000, + projection_dim=1024, + ), + ) + + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_gemma3, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3, + model_class=Gemma3ForConditionalGeneration, + mini_model_config=Gemma3Config( + vision_config=SiglipVisionConfig( + attention_dropout=0.0, + hidden_act="gelu_pytorch_tanh", + hidden_size=1152, + image_size=224, + intermediate_size=2048, # 4304 + layer_norm_eps=1e-06, + num_attention_heads=4, # 16 + num_channels=3, + num_hidden_layers=4, # 27 + num_image_tokens=256, + num_positions=256, + patch_size=14, + ).to_dict(), + text_config=Gemma3TextConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + image_token_index=5, # NOTE: outside the vocab size + boi_token_index=4, + eoi_token_index=6, + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_vl, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + attention_dropout=0.0, + # Token Ids and vocab size must match those in the tokenizer/processor + # test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json + bos_token_id=0, + eos_token_id=0, + vision_start_token_id=1, + vision_end_token_id=2, + vision_token_id=3, + image_token_id=4, + video_token_id=5, + hidden_act="silu", + hidden_size=1024, # 8192 + initializer_range=0.02, + intermediate_size=1024, # 29568 + max_position_embeddings=32768, + 
max_window_layers=4, # 80 + num_attention_heads=8, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=True, + use_cache=False, # True + vocab_size=32000, # 152064, + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "embed_dim": 128, # 1280 + "mlp_ratio": 1, + "num_heads": 8, # 16 + "in_chans": 3, + "hidden_size": 1024, # 1536 + }, + attn_implementation="sdpa", + ), + ) + +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_llava, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + num_key_value_heads=2, # 16 + max_position_embeddings=4096, # 8192 + vocab_size=32000, # 151936 + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + tie_word_embeddings=False, + ), + vision_config={ + "hidden_size": 256, # 1024 + "intermediate_size": 1024, # 4096 + "num_hidden_layers": 4, # 24 + "num_attention_heads": 4, # 16 + }, + image_token_id=24, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if SMOLVLM2_AVAILABLE: + MINI_MODEL_SETUPS["mini_smolvlm2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smolvlm, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_smolvlm2, + model_class=SmolVLMForConditionalGeneration, + mini_model_config=SmolVLMConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + hidden_act="silu", + hidden_size=576, # 576 for 256M model + initializer_range=0.041666666666666664, + intermediate_size=1536, # 1536 for 256M model + max_position_embeddings=8192, + num_attention_heads=9, # 9 for 256M model + num_hidden_layers=4, # 30 -> reduced to 4 for testing + num_key_value_heads=3, # 3 for 256M model + rms_norm_eps=1e-5, + tie_word_embeddings=False, + vocab_size=49280, + ), + vision_config={ + "hidden_size": 768, + "intermediate_size": 3072, + "num_hidden_layers": 4, # 12 -> reduced to 4 for testing + "num_attention_heads": 12, + "image_size": 512, + "patch_size": 16, + }, + image_token_id=49190, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=functools.partial(apply_liger_kernel_to_qwen2_5_vl, fused_linear_cross_entropy=False), + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + attention_dropout=0.0, + # Token Ids and vocab size must match those in the tokenizer/processor + # test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json + bos_token_id=0, + eos_token_id=0, + vision_start_token_id=1, + vision_end_token_id=2, + vision_token_id=3, + image_token_id=4, + video_token_id=5, + hidden_act="silu", + hidden_size=1024, # 8192 + initializer_range=0.02, + intermediate_size=1024, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=8, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=True, + use_cache=False, # True + vocab_size=32000, # 152064, + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "hidden_size": 128, # 1280 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 1024, + }, + attn_implementation="sdpa", + ), + ) + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3VLVisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + text_config=Qwen3VLTextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + 
use_cache=False, + tie_word_embeddings=True, + attention_dropout=0.0, + attention_bias=False, + ).to_dict(), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + 
text_config=Qwen3VLMoeTextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=False, + tie_word_embeddings=True, + attention_dropout=0.0, + attention_bias=False, + decoder_sparse_step=1, + moe_intermediate_size=1024, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + ).to_dict(), + ), + ) + +if QWEN3_5_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForConditionalGeneration, + mini_model_config=Qwen3_5Config( + attn_implementation="sdpa", + image_token_id=4, + video_token_id=5, + vision_start_token_id=1, + vision_end_token_id=2, + tie_word_embeddings=True, + vision_config=Qwen3_5VisionConfig( + depth=4, + hidden_size=256, + hidden_act="gelu_pytorch_tanh", + intermediate_size=512, + num_heads=4, + in_channels=3, + patch_size=16, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=512, + num_position_embeddings=256, + deepstack_visual_indexes=[1, 2, 3], + initializer_range=0.02, + ).to_dict(), + text_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + head_dim=64, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=False, + tie_word_embeddings=True, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + attention_dropout=0.0, + attention_bias=False, + decoder_sparse_step=1, + moe_intermediate_size=1024, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + pad_token_id=None, + ).to_dict(), + ), + ) + +if PIXTRAL_AVAILABLE: + MINI_MODEL_SETUPS["mini_pixtral"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_pixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_pixtral, + model_class=PixtralVisionModel, + mini_model_config=PixtralVisionConfig( + hidden_size=1024, + intermediate_size=2048, + num_hidden_layers=4, + num_attention_heads=8, + num_channels=3, + image_size=256, + patch_size=16, + hidden_act="silu", + attention_dropout=0.0, + rope_theta=10000.0, + initializer_range=0.02, + ), + ) + + +def create_processor(model_name: str): + if model_name == "mini_qwen2_vl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor() + video_processor = Qwen2VLVideoProcessor() + return Qwen2VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name == "mini_qwen2_5_vl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen2.5-VL-7B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), 
+ key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor() + video_processor = Qwen2VLVideoProcessor() + return Qwen2_5_VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name in ("mini_qwen3_vl", "mini_qwen3_vl_moe", "mini_qwen3_5"): + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "Qwen/Qwen3-VL-4B-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2) + video_processor = Qwen3VLVideoProcessor() + return Qwen3VLProcessor( + image_processor=image_processor, + video_processor=video_processor, + tokenizer=qwen_tokenizer, + ) + + elif model_name == "mini_llava": + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/tokenizer_config.json", + ) + ) + image_processor_config = load_image_processing_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/preprocessor_config.json", + ) + ) + processor_config = load_processor_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Llava/llava-1.5-7b-hf/processor_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + fast_tokenizer.model_input_names = ["input_ids", "attention_mask"] + image_processor = CLIPImageProcessor(**image_processor_config) + + return LlavaProcessor(**processor_config, image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name == "mini_internvl": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "OpenGVLab/InternVL3-1B-hf/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = GotOcr2ImageProcessorFast( + crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448} + ) + video_processor = InternVLVideoProcessor() + + # Return proper InternVL processor + return InternVLProcessor( + image_processor=image_processor, tokenizer=qwen_tokenizer, video_processor=video_processor + ) + + elif model_name == "mini_smolvlm2": + tokenizer_config = load_tokenizer_config( + os.path.join(FAKE_CONFIGS_PATH, "HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json") + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = SmolVLMImageProcessor(size={"longest_edge": 512}) + video_processor = SmolVLMVideoProcessor() + + # Return proper SmolVLM processor + return SmolVLMProcessor( + image_processor=image_processor, 
tokenizer=gpt2_tokenizer, video_processor=video_processor + ) + + elif model_name.startswith("mini_llama4"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "meta-llama/Llama-4-Scout-17B-16E-Instruct/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Llama4ImageProcessorFast(size={"height": 560, "width": 560}) + return Llama4Processor( + image_processor=image_processor, + tokenizer=fast_tokenizer, + fake_image_token="<|image|>", + image_token="<|image|>", + ) + + elif model_name == "mini_mllama": + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = MllamaImageProcessor(size={"height": 560, "width": 560}) + return MllamaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name.startswith("mini_paligemma"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256) + return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) + + elif model_name.startswith("mini_gemma3"): + tokenizer_config = load_tokenizer_config( + os.path.join( + FAKE_CONFIGS_PATH, + "Google/Gemma3/gemma-3-4b-it/tokenizer_config.json", + ) + ) + tokenizer_base = train_bpe_tokenizer( + [ + token.content + for key, token in sorted( + tokenizer_config["added_tokens_decoder"].items(), + key=lambda x: int(x[0]), + ) + ] + ) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) + image_processor = Gemma3ImageProcessor() + return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer) + + else: + raise ValueError(f"Processor not available for model {model_name}") + + +def create_multimodal_dataset(model_name: str): + processor = create_processor(model_name) + + def generate_procedural_image(example, index): + """Generate an image with a single row of white pixels at the index specified""" + image = torch.zeros(3, TEST_IMAGE_DIM, TEST_IMAGE_DIM) + image[:, index % TEST_IMAGE_DIM, :] = 255 + example["image"] = image + return example + + def apply_chat_template(example): + """ + Under the hood, this inserts the correct image placeholder token into the text. + This conversation format is more or less the one used by HF's mllms. The fact that it is + formatted as if for IFT is not in and of itself important here. 
+ """ + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "Describe this image."}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": example["text"]}], + }, + ] + example["text"] = processor.tokenizer.apply_chat_template(conversation, tokenize=False) + return example + + def preprocess_function(examples): + """Tokenize text, preprocess images, and generate other relevant inputs for the model.""" + if model_name == "mini_llama4": + # Process images and text separately to avoid complex token replacement, this helped setting lower tolerance than processing them together. + image_inputs = processor.image_processor(images=examples["image"], return_tensors="pt") + text_inputs = processor.tokenizer( + examples["text"], + padding="max_length", + truncation=True, + max_length=1024, + return_tensors="pt", + ) + return {**text_inputs, **image_inputs} + else: + # For other models, use the normal processor + return processor( + text=examples["text"], + images=examples["image"], + padding="max_length", + truncation=True, + max_length=1024, # longer than for text-only b/c images require quite a few tokens + return_tensors="pt", + ) + + train_dataset = ( + load_dataset("text", data_files={"train": UNTOKENIZED_DATASET_PATH}, split="train") + .to_iterable_dataset() # only map examples as-needed and on-demand + .map(generate_procedural_image, with_indices=True) + .map(apply_chat_template) + .map(preprocess_function, remove_columns=["text", "image"]) + ) + return train_dataset + + +def create_model(model_name): + """ + Create a mini version model + The commented values are the original values + """ + model_config = MINI_MODEL_SETUPS[model_name].mini_model_config + model_class = MINI_MODEL_SETUPS[model_name].model_class + return model_class(model_config) + + +@require_deterministic +def run_mini_model_multimodal( + model_name="mini_qwen2_vl", + num_steps=100, + dtype=torch.bfloat16, + lr=1e-5, + with_liger=False, +): + # If we move it to the beginning of test_mini_model, the two runs are initialized with different weights. + # This is due to RNG (Random Number Generator). The formula of RNG progression is x_(n+1) = (a * x_n + c) % m + # Everytime RNG is used, like randomly initialzing weight, the RNG progresses to the next state. + # Therefore, we have to reset RNG before we create the model to ensure the weight initialization started from the same RNG state. 
+ + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + if "mllama" in model_name or "llama4" in model_name or "qwen3_5" in model_name: + revert_kwargs["model_type"] = "conditional_generation" + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + "cross_entropy": False, + } + if "llama4" in model_name: + kwargs["rope"] = False + if ( + "qwen2_5_vl" not in model_name + and "llava" not in model_name + and "qwen3_vl" not in model_name + and "qwen3_5" not in model_name + ): + kwargs["layer_norm"] = True + + if "qwen3_5" in model_name: + kwargs["rope"] = False + + if "gemma" in model_name: + kwargs["geglu"] = True + else: + kwargs["swiglu"] = True + + if "llava" in model_name: + apply_liger_kernel_to_llama(**kwargs) + + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + + model.gradient_checkpointing_enable() + + train_dataset = create_multimodal_dataset(model_name) + loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=multimodal_collate_fn) + loader_iter = iter(loader) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + batch = next(loader_iter).to(model.device) + optimizer.zero_grad() + output = model(**batch) + output.loss.backward() + optimizer.step() + + print(f"Step {i}, Loss: {output.loss.item()}") + loss_list.append(output.loss.item()) + + model.eval() + eval_batch = next(loader_iter).to(model.device) + if with_liger: + eval_batch["skip_logits"] = False + with torch.no_grad(): + eval_output = model(**eval_batch) + print(f"Eval Loss: {eval_output.loss.item()}") + loss_list.append(eval_output.loss.item()) + topk_logprobs = get_topk(get_logprobs(eval_output.logits)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_qwen2_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + pytest.mark.skipif(not is_torchvision_available(), reason="Qwen2VLVideoProcessor requires torchvision"), + ], + ), + # Disabled since the Llama4 image processor rescales and normalizes images to torch.bfloat16, so the dtype of the model parameters has to be bfloat16 + # Refer to: https://github.com/huggingface/transformers/blob/67ddc82fbc7e52c6f42a395b4a6d278c55b77a39/src/transformers/models/llama4/image_processing_llama4_fast.py#L371 + pytest.param( + "mini_llama4", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif( + not LLAMA4_AVAILABLE, + reason="Llama4 not available in this version of transformers", + ), + pytest.mark.xfail( + reason=( + "RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype:" + " float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead." 
+ ) + ), + ], + ), + pytest.param( + "mini_llava", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + pytest.mark.skipif( + True, + reason="Flaky test", + ), + ], + ), + pytest.param( + "mini_internvl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ), + pytest.param( + "mini_smolvlm2", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not SMOLVLM2_AVAILABLE, + reason="SmolVLM2 not available in this version of transformers", + ), + pytest.mark.skipif( + not NUM2WORDS_AVAILABLE, + reason="num2words must be present to run SmolVLMProcessor", + ), + ], + ), + pytest.param( + "mini_qwen2_5_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + pytest.mark.skipif(not is_torchvision_available(), reason="Qwen2VLVideoProcessor requires torchvision"), + ], + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + pytest.mark.skipif( + True, + reason="Flaky test", + ), + ], + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-4, + torch.float32, + 1e-7, + 5e-4, + 5e-2, + 5e-3, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + pytest.mark.skipif( + True, + reason="Flaky test", + ), + ], + ), + pytest.param( + "mini_mllama", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + pytest.mark.skipif( + version.parse("4.51.0") > version.parse(transformers.__version__), + reason="MllamaForConditionalGeneration doesn't accept the `skip_logits` kwarg", + ), + ], + ), + pytest.param( + "mini_paligemma", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not PALIGEMMA_AVAILABLE, + reason="Paligemma not available in this version of transformers", + ), + ), + pytest.param( + "mini_paligemma2", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not PALIGEMMA_AVAILABLE, + reason="Paligemma2 not available in this version of transformers", + ), + ), + pytest.param( + "mini_gemma3", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-4, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3.5 not available in this 
version of transformers", + ), + pytest.mark.skipif( + not is_torchvision_available(), + reason="Qwen3VLVideoProcessor requires torchvision", + ), + ], + ), + ], +) +def test_mini_model_multimodal( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + expected_output = run_mini_model_multimodal(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model_multimodal( + model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True + ) + + # Compare the loss of every step + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the logits from the last step + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logrpobs]", + ) + + # Compare the params from the last step + # Iterate over the model's parameters and compare them + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) + + +# +# Vision-only model tests (e.g. Pixtral vision encoder) +# + + +def generate_procedural_pixel_values(batch_size, num_channels, image_size, index, dtype, device): + """Generate deterministic pixel values for vision-only model testing. + + Each image has a single row of white pixels at a deterministic position, + providing a reproducible signal for convergence testing. 
+ """ + pixel_values = torch.zeros(batch_size, num_channels, image_size, image_size, dtype=dtype, device=device) + for b in range(batch_size): + row = (index + b) % image_size + pixel_values[b, :, row, :] = 1.0 + return pixel_values + + +@require_deterministic +def run_mini_model_vision( + model_name="mini_pixtral", + num_steps=100, + dtype=torch.float32, + lr=1e-5, + with_liger=False, +): + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + "swiglu": True, + } + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + optimizer.zero_grad() + pixel_values = generate_procedural_pixel_values( + batch_size=2, + num_channels=model.config.num_channels, + image_size=model.config.image_size, + index=i, + dtype=dtype, + device=device, + ) + output = model(pixel_values=pixel_values) + loss = output.last_hidden_state.sum() + loss.backward() + optimizer.step() + print(f"Step {i}, Loss: {loss.item()}") + loss_list.append(loss.item()) + + # Eval step with deterministic input + model.eval() + with torch.no_grad(): + eval_pixel_values = generate_procedural_pixel_values( + batch_size=2, + num_channels=model.config.num_channels, + image_size=model.config.image_size, + index=num_steps, + dtype=dtype, + device=device, + ) + eval_output = model(pixel_values=eval_pixel_values) + + topk_logprobs = get_topk(get_logprobs(eval_output.last_hidden_state)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_pixtral", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not PIXTRAL_AVAILABLE, reason="Pixtral not available in this version of transformers" + ), + ], + ), + ], +) +def test_mini_model_vision( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + expected_output = run_mini_model_vision(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model_vision( + model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True + ) + + # Compare the loss of every step + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # Compare the topk logprobs from evaluation step + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top k logprobs]", + ) + + # Compare the params from the last step + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model 
parameters]", + ) diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py new file mode 100755 index 0000000000000000000000000000000000000000..d225e08bafa4a1f99ffb6f6840cfe55acf77b163 --- /dev/null +++ b/test/convergence/fp32/test_mini_models_with_logits.py @@ -0,0 +1,2038 @@ +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # Ensure deterministic behavior with CuBLAS + +import pytest +import torch +import transformers + +from datasets import load_from_disk +from packaging import version +from torch.utils.data import DataLoader +from transformers.models.gemma import GemmaConfig +from transformers.models.gemma import GemmaForCausalLM +from transformers.models.gemma2 import Gemma2Config +from transformers.models.gemma2 import Gemma2ForCausalLM +from transformers.models.llama import LlamaConfig +from transformers.models.llama import LlamaForCausalLM +from transformers.models.mistral import MistralConfig +from transformers.models.mistral import MistralForCausalLM +from transformers.models.mixtral import MixtralConfig +from transformers.models.mixtral import MixtralForCausalLM +from transformers.models.phi3 import Phi3Config +from transformers.models.phi3 import Phi3ForCausalLM +from transformers.models.qwen2 import Qwen2Config +from transformers.models.qwen2 import Qwen2ForCausalLM + +from liger_kernel.transformers import apply_liger_kernel_to_exaone4 +from liger_kernel.transformers import apply_liger_kernel_to_falcon_h1 +from liger_kernel.transformers import apply_liger_kernel_to_gemma +from liger_kernel.transformers import apply_liger_kernel_to_gemma2 +from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text +from liger_kernel.transformers import apply_liger_kernel_to_glm4 +from liger_kernel.transformers import apply_liger_kernel_to_glm4v +from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe +from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe +from liger_kernel.transformers import apply_liger_kernel_to_internvl +from liger_kernel.transformers import apply_liger_kernel_to_llama +from liger_kernel.transformers import apply_liger_kernel_to_llama4 +from liger_kernel.transformers import apply_liger_kernel_to_llava +from liger_kernel.transformers import apply_liger_kernel_to_mistral +from liger_kernel.transformers import apply_liger_kernel_to_mixtral +from liger_kernel.transformers import apply_liger_kernel_to_mllama +from liger_kernel.transformers import apply_liger_kernel_to_olmo2 +from liger_kernel.transformers import apply_liger_kernel_to_olmo3 +from liger_kernel.transformers import apply_liger_kernel_to_phi3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2 +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_5_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_5 +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_next +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl +from liger_kernel.transformers import apply_liger_kernel_to_qwen3_vl_moe +from liger_kernel.transformers import apply_liger_kernel_to_smollm3 +from 
liger_kernel.utils import infer_device +from test.utils import DEFAULT_DATASET_PATH +from test.utils import MiniModelConfig +from test.utils import assert_verbose_allclose +from test.utils import get_logprobs +from test.utils import get_topk +from test.utils import require_deterministic +from test.utils import revert_liger_kernel_to_exaone4 +from test.utils import revert_liger_kernel_to_falcon_h1 +from test.utils import revert_liger_kernel_to_gemma +from test.utils import revert_liger_kernel_to_gemma2 +from test.utils import revert_liger_kernel_to_gemma3_text +from test.utils import revert_liger_kernel_to_glm4 +from test.utils import revert_liger_kernel_to_glm4v +from test.utils import revert_liger_kernel_to_glm4v_moe +from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe +from test.utils import revert_liger_kernel_to_internvl +from test.utils import revert_liger_kernel_to_llama +from test.utils import revert_liger_kernel_to_llama4 +from test.utils import revert_liger_kernel_to_llava +from test.utils import revert_liger_kernel_to_mistral +from test.utils import revert_liger_kernel_to_mixtral +from test.utils import revert_liger_kernel_to_mllama +from test.utils import revert_liger_kernel_to_olmo2 +from test.utils import revert_liger_kernel_to_olmo3 +from test.utils import revert_liger_kernel_to_phi3 +from test.utils import revert_liger_kernel_to_qwen2 +from test.utils import revert_liger_kernel_to_qwen2_5_vl +from test.utils import revert_liger_kernel_to_qwen2_vl +from test.utils import revert_liger_kernel_to_qwen3 +from test.utils import revert_liger_kernel_to_qwen3_5 +from test.utils import revert_liger_kernel_to_qwen3_moe +from test.utils import revert_liger_kernel_to_qwen3_next +from test.utils import revert_liger_kernel_to_qwen3_vl +from test.utils import revert_liger_kernel_to_qwen3_vl_moe +from test.utils import revert_liger_kernel_to_smollm3 +from test.utils import set_seed +from test.utils import simple_collate_fn + +IS_TRANSFORMERS_V5_OR_LATER = version.parse(transformers.__version__) >= version.parse("5.0.0") + +try: + from transformers.models.llama4.configuration_llama4 import Llama4TextConfig + from transformers.models.llama4.modeling_llama4 import Llama4ForCausalLM + + LLAMA4_AVAILABLE = True +except ImportError: + LLAMA4_AVAILABLE = False + +try: + # Mllama is only available in transformers>=4.45.0 + from transformers.models.mllama.configuration_mllama import MllamaTextConfig + from transformers.models.mllama.modeling_mllama import MllamaForCausalLM + + MLLAMA_AVAILABLE = True +except ImportError: + MLLAMA_AVAILABLE = False + +try: + # Qwen2-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig + from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration + + QWEN2_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except ImportError: + QWEN2_VL_AVAILABLE = False + +try: + # Qwen2.5-VL is only available in transformers>4.52.4 + import transformers + + from packaging import version + from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + + QWEN2_5_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.52.4") +except 
ImportError: + QWEN2_5_VL_AVAILABLE = False + + +try: + # Qwen3-VL is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration + + QWEN3_VL_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_AVAILABLE = False + + +try: + # Qwen3-VL-MoE is only available in transformers>=4.57.0 + import transformers + + from packaging import version + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeTextConfig + from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeVisionConfig + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration + + QWEN3_VL_MOE_AVAILABLE = version.parse(transformers.__version__) >= version.parse("4.57.0") +except ImportError: + QWEN3_VL_MOE_AVAILABLE = False + +try: + from transformers.models.qwen3.configuration_qwen3 import Qwen3Config + from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM + from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig + from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeForCausalLM + + QWEN3_AVAILABLE = True +except ImportError: + QWEN3_AVAILABLE = False + +try: + from transformers.models.granite import GraniteConfig + from transformers.models.granite import GraniteForCausalLM + + GRANITE_AVAILABLE = True +except ImportError: + GRANITE_AVAILABLE = False + +try: + from transformers import CLIPVisionConfig + from transformers.models.llava.configuration_llava import LlavaConfig + from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + + LLAVA_AVAILABLE = True +except ImportError: + LLAVA_AVAILABLE = False + +try: + # OLMO2 is only available in transformers>=4.47.0 + from transformers.models.olmo2.configuration_olmo2 import Olmo2Config + from transformers.models.olmo2.modeling_olmo2 import Olmo2ForCausalLM + + OLMO2_AVAILABLE = True +except ImportError: + OLMO2_AVAILABLE = False + +try: + # OLMO3 is only available in transformers>=4.57.0 + from transformers.models.olmo3.configuration_olmo3 import Olmo3Config + from transformers.models.olmo3.modeling_olmo3 import Olmo3ForCausalLM + + OLMO3_AVAILABLE = True +except ImportError: + OLMO3_AVAILABLE = False + +try: + # Glm4 is only available in transformers>=4.51.3 + from transformers.models.glm4.configuration_glm4 import Glm4Config + from transformers.models.glm4.modeling_glm4 import Glm4ForCausalLM + + GLM4_AVAILABLE = True +except ImportError: + GLM4_AVAILABLE = False + +try: + # Glm4v is only available in transformers>=4.51.3 + from transformers.models.glm4v.configuration_glm4v import Glm4vConfig + from transformers.models.glm4v.modeling_glm4v import Glm4vForConditionalGeneration + + GLM4V_AVAILABLE = True +except ImportError: + 
GLM4V_AVAILABLE = False + +try: + # Glm4v_moe is only available in transformers>=4.51.3 + from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig + from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeForConditionalGeneration + + GLM4V_MOE_AVAILABLE = True +except ImportError: + GLM4V_MOE_AVAILABLE = False + +try: + from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig + from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM + + GEMMA3_AVAILABLE = True +except ImportError: + GEMMA3_AVAILABLE = False + +try: + # Smollm3 is only available in transformers>=4.53.0 + from transformers.models.smollm3.configuration_smollm3 import SmolLM3Config + from transformers.models.smollm3.modeling_smollm3 import SmolLM3ForCausalLM + + SMOLLM3_AVAILABLE = True +except ImportError: + SMOLLM3_AVAILABLE = False + +try: + # InternVL is only available in transformers>=4.52.1 + from transformers.models.internvl.configuration_internvl import InternVLConfig + from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration + + INTERNVL_AVAILABLE = True +except ImportError: + INTERNVL_AVAILABLE = False + +try: + # FalconH1 is only available in transformers>=4.53.0 + from transformers.models.falcon_h1.configuration_falcon_h1 import FalconH1Config + from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1ForCausalLM + + FALCONH1_AVAILABLE = True +except ImportError: + FALCONH1_AVAILABLE = False + +try: + # Qwen3Next is only available in transformers>=4.57.0 + from transformers.models.qwen3_next.configuration_qwen3_next import Qwen3NextConfig + from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM + + QWEN3NEXT_AVAILABLE = True +except ImportError: + QWEN3NEXT_AVAILABLE = False + +try: + from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig + from transformers.models.qwen3_5.modeling_qwen3_5 import Qwen3_5ForCausalLM + + QWEN3_5_AVAILABLE = True +except ImportError: + QWEN3_5_AVAILABLE = False + +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + +try: + from transformers.models.exaone4.configuration_exaone4 import Exaone4Config + from transformers.models.exaone4.modeling_exaone4 import Exaone4ForCausalLM + + EXAONE4_AVAILABLE = True +except ImportError: + EXAONE4_AVAILABLE = False + + +device = infer_device() + +MINI_MODEL_SETUPS = { + "mini_llama3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama, + model_class=LlamaForCausalLM, + mini_model_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 
+ num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_qwen2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2, + model_class=Qwen2ForCausalLM, + mini_model_config=Qwen2Config( + attention_dropout=0.0, + bos_token_id=1, # 151643 + eos_token_id=2, # 151643 + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, # 131072 + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, # 151936 + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ), + "mini_phi3": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_phi3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_phi3, + model_class=Phi3ForCausalLM, + mini_model_config=Phi3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, # 32000 + hidden_act="silu", + hidden_size=896, # 3072 + initializer_range=0.02, + intermediate_size=4864, # 8192 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=None, # defaults to num_attention_heads + rms_norm_eps=1e-5, + sliding_window=None, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32064, + attn_implementation="eager", + ), + ), + "mini_mistral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mistral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mistral, + model_class=MistralForCausalLM, + mini_model_config=MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_mixtral": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mixtral, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mixtral, + model_class=MixtralForCausalLM, + mini_model_config=MixtralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=512, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=32768, # 32768 + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ), + "mini_gemma1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 
24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + # The gemma1 model config uses `hidden_act` and points it to gelu, + # https://huggingface.co/google/gemma-7b/blob/main/config.json#L10 + # but in reality it is ignored and HuggingFace uses the tanh approximation: + # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/gemma/modeling_gemma.py#L175 + hidden_act="gelu", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma1.1": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma, + model_class=GemmaForCausalLM, + mini_model_config=GemmaConfig( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + ), + ), + "mini_gemma2": MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma2, + model_class=Gemma2ForCausalLM, + mini_model_config=Gemma2Config( + vocab_size=32000, # 256000 + hidden_size=1024, # 3072 + intermediate_size=2048, # 24576 + num_hidden_layers=4, # 28 + num_attention_heads=4, # 16 + num_key_value_heads=4, # 16 + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ), +} +if LLAMA4_AVAILABLE: + MINI_MODEL_SETUPS["mini_llama4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llama4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llama4, + model_class=Llama4ForCausalLM, + mini_model_config=Llama4TextConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=1.0, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + 
), + ) + +if QWEN3_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3, + model_class=Qwen3ForCausalLM, + mini_model_config=Qwen3Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_qwen3_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_moe, + model_class=Qwen3MoeForCausalLM, + mini_model_config=Qwen3MoeConfig( + vocab_size=32000, # 151936 + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + mlp_only_layers=None, + ), + ) + +if GEMMA3_AVAILABLE: + MINI_MODEL_SETUPS["mini_gemma3_text"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_gemma3_text, + liger_kernel_patch_revert_func=revert_liger_kernel_to_gemma3_text, + model_class=Gemma3ForCausalLM, + mini_model_config=Gemma3TextConfig( + vocab_size=32000, # 262144 + hidden_size=1024, # 1152 + intermediate_size=2048, # 6912 + num_hidden_layers=4, # 26 + num_attention_heads=4, + num_key_value_heads=1, + head_dim=256, + hidden_activation="gelu_pytorch_tanh", + max_position_embeddings=8192, # 32768 + initializer_range=0.02, + rms_norm_eps=1e-06, + use_cache=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + tie_word_embeddings=True, + attention_bias=False, + attention_dropout=0.0, + attn_implementation="eager", + ), + ) + +if MLLAMA_AVAILABLE: + MINI_MODEL_SETUPS["mini_mllama"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_mllama, + liger_kernel_patch_revert_func=revert_liger_kernel_to_mllama, + model_class=MllamaForCausalLM, + mini_model_config=MllamaTextConfig( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=131_072, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + rope_scaling=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + original_max_position_embeddings=8192, + rope_type="llama3", + rope_theta=500_000, + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + ) + +if QWEN2_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_vl, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_vl, + model_class=Qwen2VLForConditionalGeneration, + mini_model_config=Qwen2VLConfig( + attention_dropout=0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 151643 + eos_token_id=2, # 151645 + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + vision_token_id=32767, # vocab_size - 3 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + hidden_act="silu", + hidden_size=1536, # 8192 + initializer_range=0.02, + intermediate_size=4864, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=12, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "embed_dim": 1280, + "mlp_ratio": 4, + "num_heads": 16, + "in_chans": 3, + "hidden_size": 128, # 1536 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "temporal_patch_size": 2, + }, + attn_implementation="sdpa", + ), + ) + +if QWEN2_5_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen2_5_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen2_5_vl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2_5_vl, + model_class=Qwen2_5_VLForConditionalGeneration, + mini_model_config=Qwen2_5_VLConfig( + attention_dropout=0.0, + # bos and eos set to match the Mistral-7B tokenizer used to create the test dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 151643 + eos_token_id=2, # 151645 + vision_start_token_id=32765, # vocab_size - 5 + vision_end_token_id=32766, # vocab_size - 4 + vision_token_id=32767, # vocab_size - 3 + image_token_id=32768, # vocab_size - 2 + video_token_id=32769, # vocab_size - 1 + hidden_act="silu", + hidden_size=1536, # 8192 + initializer_range=0.02, + intermediate_size=4864, # 29568 + max_position_embeddings=32768, + max_window_layers=4, # 80 + num_attention_heads=12, # 64 + num_hidden_layers=4, # 80 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-6, # 1e-5 + **( + dict(rope_parameters=dict(mrope_section=[16, 24, 24])) # (temporal, height, width) + if IS_TRANSFORMERS_V5_OR_LATER + else dict(rope_scaling=dict(type="mrope", mrope_section=[16, 24, 24])) + ), + sliding_window=4096, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32768, # 152064 # >32k, Mistral-7B tokenizer vocab size + use_sliding_window=False, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "window_size": 112, + "fullatt_block_indexes": [7, 15, 23, 31], + "tokens_per_second": 2, + "temporal_patch_size": 2, + }, + attn_implementation="sdpa", + ), + ) + +if QWEN3_VL_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl, + model_class=Qwen3VLForConditionalGeneration, + mini_model_config=Qwen3VLConfig( + tie_word_embeddings=False, + image_token_id=31997, + video_token_id=31998, + vision_start_token_id=31995, + vision_end_token_id=31996, + text_config=dict( + attention_dropout=0.0, + attn_implementation="sdpa", + bos_token_id=1, + eos_token_id=2, + head_dim=112, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pad_token_id=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + rope_scaling=dict( + type="mrope", + mrope_section=[16, 24, 24], # (temporal, height, width) + ) + if not IS_TRANSFORMERS_V5_OR_LATER + else None, + ), + vision_config=dict( + depth=4, + hidden_size=128, + initializer_range=0.02, + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=896, + num_position_embeddings=576, + deepstack_visual_indexes=[1, 2, 3], + ), + ), + ) + +if QWEN3_VL_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_vl_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_vl_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_vl_moe, + model_class=Qwen3VLMoeForConditionalGeneration, + mini_model_config=Qwen3VLMoeConfig( + tie_word_embeddings=False, + image_token_id=31997, + video_token_id=31998, + vision_start_token_id=31995, + vision_end_token_id=31996, + text_config=Qwen3VLMoeTextConfig( + attention_dropout=0.0, + attention_bias=False, + attn_implementation="sdpa", + bos_token_id=1, + eos_token_id=2, + head_dim=112, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pad_token_id=2, + rms_norm_eps=1e-6, + sliding_window=131072, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, + decoder_sparse_step=1, + moe_intermediate_size=3072, + num_experts_per_tok=2, + num_experts=4, + mlp_only_layers=[], + ).to_dict(), + vision_config=Qwen3VLMoeVisionConfig( + depth=4, + hidden_size=128, + initializer_range=0.02, + intermediate_size=256, + num_heads=8, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + out_hidden_size=896, + num_position_embeddings=576, + deepstack_visual_indexes=[1, 2, 3], + ).to_dict(), + ), + ) + +if GRANITE_AVAILABLE: + MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_granite, + liger_kernel_patch_revert_func=revert_liger_kernel_to_granite, + model_class=GraniteForCausalLM, + mini_model_config=GraniteConfig( + attention_bias=False, + attention_dropout=0.0, + # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset + # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + logits_scaling=4.0, + # At rope backward + # 
Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if LLAVA_AVAILABLE: + # https://huggingface.co/llava-hf/llava-1.5-7b-hf + MINI_MODEL_SETUPS["mini_llava"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_llava, + liger_kernel_patch_revert_func=revert_liger_kernel_to_llava, + model_class=LlavaForConditionalGeneration, + mini_model_config=LlavaConfig( + text_config=LlamaConfig( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=1024, + initializer_range=0.02, + intermediate_size=2048, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + pretraining_tp=1, + tie_word_embeddings=False, + use_cache=True, + max_position_embeddings=4096, # llava-1.5-7b-hf + rms_norm_eps=1e-05, # llava-1.5-7b-hf + vocab_size=32064, # llava-1.5-7b-hf + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + vision_config=CLIPVisionConfig( + hidden_size=1024, + image_size=336, + intermediate_size=2048, # 4096 + model_type="clip_vision_model", + num_attention_heads=4, # 16 + num_hidden_layers=4, # 24 + patch_size=14, + projection_dim=768, + vocab_size=32000, + ), + vocab_size=32064, + ignore_index=-100, + pad_token_id=4, + image_token_index=3, + projector_hidden_act="gelu", + vision_feature_layer=-2, + vision_feature_select_strategy="default", + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO2_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo2"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo2, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo2, + model_class=Olmo2ForCausalLM, + mini_model_config=Olmo2Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if OLMO3_AVAILABLE: + MINI_MODEL_SETUPS["mini_olmo3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_olmo3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_olmo3, + model_class=Olmo3ForCausalLM, + mini_model_config=Olmo3Config( + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 40 + num_key_value_heads=2, # 8 + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4"] = 
MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4, + model_class=Glm4ForCausalLM, + mini_model_config=Glm4Config( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if GLM4V_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v, + model_class=Glm4vForConditionalGeneration, + mini_model_config=Glm4vConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + "pad_token_id": None, + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) +if GLM4V_MOE_AVAILABLE: + MINI_MODEL_SETUPS["mini_glm4v_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_glm4v_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_glm4v_moe, + model_class=Glm4vMoeForConditionalGeneration, + mini_model_config=Glm4vMoeConfig( + bos_token_id=1, # None + eos_token_id=2, # 151329, 151336, 151338 + pad_token_id=2, # 151329 + image_token_id=151343, + video_token_id=151344, + image_start_token_id=151339, + image_end_token_id=151340, + video_start_token_id=151341, + video_end_token_id=151342, + partial_rotary_factor=0.5, + cross_attention_layers=None, + dropout=0, + hidden_act="silu", + hidden_size=1024, # 6144 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=4096, # 32768 + num_attention_heads=8, # 48 + num_hidden_layers=4, # 61 + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 151552 + 
attention_bias=True, + attn_implementation="sdpa", # default value, pytorch native attention + text_config={ + "partial_rotary_factor": 0.5, + "hidden_act": "silu", + "hidden_size": 1024, + "intermediate_size": 2048, + "max_position_embeddings": 4096, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-5, + "vocab_size": 32000, + "attention_bias": True, + "attention_dropout": 0.0, + "moe_intermediate_size": 1408, + "num_experts_per_tok": 2, + "n_shared_experts": 1, + "n_routed_experts": 8, + "routed_scaling_factor": 1.0, + "n_group": 1, + "topk_group": 1, + "first_k_dense_replace": 1, + "norm_topk_prob": True, + **( + {"rope_scaling": {"type": "default", "mrope_section": [8, 12, 12]}} + if not IS_TRANSFORMERS_V5_OR_LATER + else {} + ), + }, + vision_config={ + "depth": 4, # 32 + "hidden_act": "silu", + "hidden_size": 128, # 1280 + "intermediate_size": 256, # 3420 + "num_heads": 16, + "in_chans": 3, + "out_hidden_size": 128, # 3584 + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + }, + ), + ) + +if SMOLLM3_AVAILABLE: + MINI_MODEL_SETUPS["mini_smollm3"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_smollm3, + liger_kernel_patch_revert_func=revert_liger_kernel_to_smollm3, + model_class=SmolLM3ForCausalLM, + mini_model_config=SmolLM3Config( + attention_bias=False, + attention_dropout=0.0, + bos_token_id=1, # 128000 + eos_token_id=2, # 128001 + pad_token_id=2, # 128000 + hidden_act="silu", + hidden_size=1024, # 4096 + initializer_range=0.02, + intermediate_size=2048, # 14336 + max_position_embeddings=8192, + num_attention_heads=8, # 32 + num_hidden_layers=4, # 32 + num_key_value_heads=2, # 8 + pretraining_tp=1, + rms_norm_eps=1e-5, + tie_word_embeddings=False, + use_cache=True, + vocab_size=32000, # 128256, + # At rope backward + # Eager produces incontiguous dq and dk + # SDPA produces contiguous dq and incontiguous dk + # Flash_attn produces contiguous dq and dk + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if INTERNVL_AVAILABLE: + MINI_MODEL_SETUPS["mini_internvl"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_internvl, + liger_kernel_patch_revert_func=revert_liger_kernel_to_internvl, + model_class=InternVLForConditionalGeneration, + mini_model_config=InternVLConfig( + text_config=Qwen2Config( + rms_norm_eps=1e-5, + hidden_size=256, # 1024 + intermediate_size=1024, # 4096 + hidden_act="silu", + num_hidden_layers=4, # 24 + num_attention_heads=4, # 16 + num_key_value_heads=2, # 16 + max_position_embeddings=4096, # 8192 + vocab_size=32000, # 151936 + bos_token_id=1, + eos_token_id=2, + pad_token_id=2, + tie_word_embeddings=False, + ), + vision_config={ + "hidden_size": 256, # 1024 + "intermediate_size": 1024, # 4096 + "num_hidden_layers": 4, # 24 + "num_attention_heads": 4, # 16 + }, + image_token_id=10, + attn_implementation="sdpa", # default value, pytorch native attention + ), + ) + +if FALCONH1_AVAILABLE: + MINI_MODEL_SETUPS["mini_falcon_h1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_falcon_h1, + liger_kernel_patch_revert_func=revert_liger_kernel_to_falcon_h1, + model_class=FalconH1ForCausalLM, + mini_model_config=FalconH1Config( + model_type="falcon_h1", + vocab_size=32000, + hidden_size=256, # 4096 + num_hidden_layers=4, # 24 + num_attention_heads=4, # 32 + num_key_value_heads=2, # 8 + intermediate_size=1024, # 11008 + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + 
rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + mamba_d_ssm=128, # 1024 + mamba_n_heads=16, # 128 + mamba_d_state=32, # 245 + mamba_d_conv=2, # 4 + ), + ) + +if QWEN3NEXT_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_next"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_next, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_next, + model_class=Qwen3NextForCausalLM, + mini_model_config=Qwen3NextConfig( # Copypaste Qwen3MoeConfig + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + decoder_sparse_step=1, + moe_intermediate_size=768, + num_experts_per_tok=2, + num_experts=8, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + # config.dtype must be set if fla installed since there's a bug in the original code (No torch.get_current_dtype()) + # https://github.com/huggingface/transformers/blob/v4.57.1/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L613 + dtype=torch.float32, + ), + ) + +if QWEN3_5_AVAILABLE: + MINI_MODEL_SETUPS["mini_qwen3_5"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_qwen3_5, + liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen3_5, + model_class=Qwen3_5ForCausalLM, + mini_model_config=Qwen3_5TextConfig( + vocab_size=32000, + hidden_size=896, + intermediate_size=4864, + num_hidden_layers=4, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + attention_bias=False, + attention_dropout=0.0, + head_dim=128, + linear_conv_kernel_dim=4, + linear_key_head_dim=64, + linear_value_head_dim=64, + linear_num_key_heads=8, + linear_num_value_heads=8, + layer_types=["linear_attention", "linear_attention", "linear_attention", "full_attention"], + dtype=torch.float32, + ), + ) + + +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config=HunYuanMoEV1Config( + hidden_act="silu", + attention_dropout=0.0, + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + 
max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + partial_rotary_factor=1.0, + vocab_size=32000, + num_experts=8, + moe_topk=2, + use_cache=True, + attn_implementation="sdpa", + ), + ) + +if EXAONE4_AVAILABLE: + MINI_MODEL_SETUPS["mini_exaone4"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_exaone4, + liger_kernel_patch_revert_func=revert_liger_kernel_to_exaone4, + model_class=Exaone4ForCausalLM, + mini_model_config=Exaone4Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + hidden_size=896, + initializer_range=0.02, + intermediate_size=4864, + max_position_embeddings=32768, + num_attention_heads=8, + num_hidden_layers=4, + num_key_value_heads=2, + rms_norm_eps=1e-5, + tie_word_embeddings=True, + use_cache=True, + vocab_size=32000, + attn_implementation="sdpa", + pad_token_id=None, + ), + ) + + +def create_model(model_name="mini_llama3"): + """ + Create a mini version of the model. + The commented values are the original full-size values. + """ + model_config = MINI_MODEL_SETUPS[model_name].mini_model_config + model_class = MINI_MODEL_SETUPS[model_name].model_class + return model_class(model_config) + + +@require_deterministic +def run_mini_model( + model_name="mini_llama3", + num_steps=100, + dtype=torch.bfloat16, + lr=1e-5, + with_liger=False, +): + # If we moved the seeding below to the beginning of test_mini_model, the two runs would be initialized with different weights. + # This is due to the RNG (Random Number Generator). The formula of RNG progression is x_(n+1) = (a * x_n + c) % m. + # Every time the RNG is used, such as for randomly initializing weights, it progresses to the next state. + # Therefore, we have to reset the RNG before we create the model to ensure that weight initialization starts from the same RNG state. 
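+ # A minimal illustrative sketch of this (not executed here): + # set_seed(42); model_a = create_model(model_name) # RNG state S -> weights W + # set_seed(42); model_b = create_model(model_name) # RNG reset to S -> identical weights W + # Without the second set_seed, model_b would be built from a later RNG state and get different weights.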
+ + set_seed(42) + + revert_kwargs = {"model_config": MINI_MODEL_SETUPS[model_name]} + if "mllama" in model_name: + revert_kwargs["model_type"] = "causal_lm" + + if with_liger is True: + kwargs = { + "rope": True, + "rms_norm": True, + } + + if "glm4" in model_name or "llama4" in model_name or "qwen3_next" in model_name or "qwen3_5" in model_name: + kwargs["rope"] = False + + model_supports_layer_norm = "qwen2_vl" in model_name + if model_supports_layer_norm: + kwargs["layer_norm"] = True + + if "gemma" in model_name: + kwargs["geglu"] = True + else: + kwargs["swiglu"] = True + + if "llava" in model_name: + apply_liger_kernel_to_llama(**kwargs) + + kwargs["fused_linear_cross_entropy"] = False + kwargs["cross_entropy"] = False + + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs) + else: + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + + model = create_model(model_name).to(dtype).to(device) + + train_dataset = load_from_disk(DEFAULT_DATASET_PATH) + loader = DataLoader(train_dataset, batch_size=16, shuffle=False, collate_fn=simple_collate_fn) + loader_iter = iter(loader) + optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + + loss_list = [] + + for i in range(num_steps): + batch = next(loader_iter).to(model.device) + optimizer.zero_grad() + output = model(**batch) + output.loss.backward() + optimizer.step() + print(f"Step {i}, Loss: {output.loss.item()}") + loss_list.append(output.loss.item()) + + topk_logprobs = get_topk(get_logprobs(output.logits)) + MINI_MODEL_SETUPS[model_name].liger_kernel_patch_revert_func(**revert_kwargs) + return { + "loss": loss_list, + "topk_logprobs": topk_logprobs.values, + "model": model, + } + + +@pytest.mark.parametrize( + "model_name, num_steps, lr, dtype, loss_atol, loss_rtol, logprobs_atol, logprobs_rtol, param_atol, param_rtol", + [ + pytest.param( + "mini_llama4", # llama4 requires slightly larger tolerances to pass this test after bug fix to llama4 in transformers v5.0.0 + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-3, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not LLAMA4_AVAILABLE, + reason="Llama4 not available in this version of transformers", + ), + pytest.mark.xfail( + reason=( + "RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype:" + " float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead." 
+ ) + ), + ], + ), + ("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 2e-5, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_llava", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not LLAVA_AVAILABLE, + reason="LLaVa not available in this version of transformers", + ), + ), + pytest.param( + "mini_mllama", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not MLLAMA_AVAILABLE, + reason="Mllama not available in this version of transformers", + ), + ), + pytest.param( + "mini_gemma3_text", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-4, + 5e-2, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GEMMA3_AVAILABLE, + reason="Gemma3 not available in this version of transformers", + ), + ), + ("mini_qwen2", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_qwen3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_moe", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_AVAILABLE, + reason="Qwen3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen2_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 2e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN2_VL_AVAILABLE, + reason="Qwen2-VL not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen2_5_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 2e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN2_5_VL_AVAILABLE, + reason="Qwen2.5-VL not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_vl", + 32, + 1e-4, + torch.float32, + 1e-8, + 2e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not QWEN3_VL_AVAILABLE, + reason="Qwen3-VL not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_vl_moe", + 32, + 1e-4, + torch.float32, + 1e-8, + 2e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_VL_MOE_AVAILABLE, + reason="Qwen3-VL-MoE not available in this version of transformers", + ), + pytest.mark.skipif( + True, + reason="Flaky test", + ), + ], + ), + pytest.param( + "mini_olmo2", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not OLMO2_AVAILABLE, + reason="OLMO2 not available in this version of transformers", + ), + ), + pytest.param( + "mini_olmo3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not OLMO3_AVAILABLE, + reason="OLMO3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GLM4_AVAILABLE, + reason="Glm4 not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4v", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GLM4V_AVAILABLE, + reason="Glm4v not available in this version of transformers", + ), + ), + pytest.param( + "mini_glm4v_moe", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GLM4V_MOE_AVAILABLE, + 
reason="Glm4v_moe not available in this version of transformers", + ), + ), + ("mini_phi3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), + ("mini_mistral", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5), + # TODO: mixtral is flaky so disable the test for now + # ("mini_mixtral", 32, 1e-4, torch.float32, 5e-4, 1e-4, 5e-3, 1e-5, 1e-2, 1e-5), + # Gemma 1.1 and 2 has more tolerance because currently, the kernel is not a perfect match + ("mini_gemma1", 32, 1e-5, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), + ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), + ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5), + pytest.param( + "mini_granite3", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-4, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not GRANITE_AVAILABLE, + reason="Granite not available in this version of transformers", + ), + ), + pytest.param( + "mini_smollm3", + 32, + 1e-4, + torch.bfloat16, + 1e-3, + 1e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=pytest.mark.skipif( + not SMOLLM3_AVAILABLE, + reason="Smollm3 not available in this version of transformers", + ), + ), + pytest.param( + "mini_internvl", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not INTERNVL_AVAILABLE, + reason="InternVL not available in this version of transformers", + ), + ), + pytest.param( + "mini_falcon_h1", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-4, + 4e-2, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not FALCONH1_AVAILABLE, + reason="FalconH1 not available in this version of transformers", + ), + ), + pytest.param( + "mini_qwen3_next", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3NEXT_AVAILABLE, + reason="Qwen3Next not available in this version of transformers", + ), + pytest.mark.skip( + reason="flash-linear-attention's ChunkGatedDeltaRuleFunction does not support float32.\n" + + " Torch's implementation takes too long" + ), + ], + ), + pytest.param( + "mini_qwen3_5", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=[ + pytest.mark.skipif( + not QWEN3_5_AVAILABLE, + reason="Qwen3_5 not available in this version of transformers", + ), + pytest.mark.skip( + reason="flash-linear-attention's ChunkGatedDeltaRuleFunction does not support float32.\n" + + " Torch's implementation takes too long" + ), + ], + ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.float32, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.float32, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_exaone4", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not EXAONE4_AVAILABLE, + reason="EXAONE4 not available in this version of transformers", + ), + ), + ], +) +def test_mini_model( + model_name, + num_steps, + lr, + dtype, + loss_atol, + loss_rtol, + logprobs_atol, + logprobs_rtol, + param_atol, + param_rtol, +): + # Non-liger models should be initialized and tested first to avoid the module being overridden + + 
expected_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr) + + actual_output = run_mini_model(model_name=model_name, num_steps=num_steps, dtype=dtype, lr=lr, with_liger=True) + + # Compare every step of the loss + assert_verbose_allclose( + torch.tensor([expected_output["loss"]]), + torch.tensor([actual_output["loss"]]), + atol=loss_atol, + rtol=loss_rtol, + extra_info="[Loss]", + ) + + # No logits are materialized, so compare the top-k log-probabilities instead + assert_verbose_allclose( + expected_output["topk_logprobs"], + actual_output["topk_logprobs"], + atol=logprobs_atol, + rtol=logprobs_rtol, + extra_info="[Top K Logprobs]", + ) + + # Compare the parameters from the last step by iterating over both models' named parameters + for expected_param, actual_param in zip( + expected_output["model"].named_parameters(), + actual_output["model"].named_parameters(), + ): + assert_verbose_allclose( + expected_param[1], + actual_param[1], + atol=param_atol, + rtol=param_rtol, + extra_info="[Model parameters]", + ) diff --git a/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json b/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json new file mode 100755 index 0000000000000000000000000000000000000000..5c5c2adf67d113a9cc6896464cd3b98436b01fee --- /dev/null +++ b/test/resources/fake_configs/Google/Gemma3/gemma-3-4b-it/tokenizer_config.json @@ -0,0 +1,90 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "6": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "boi_token": "", + "bos_token": "", + "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if
message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'model\n'}}\n{%- endif -%}\n", + "clean_up_tokenization_spaces": false, + "eoi_token": "", + "eos_token": "", + "extra_special_tokens": { + "boi_token": "", + "eoi_token": "", + "image_token": "" + }, + "image_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "Gemma3Processor", + "sp_model_kwargs": null, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} \ No newline at end of file diff --git a/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json b/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json new file mode 100755 index 0000000000000000000000000000000000000000..6c7d5eec91a94408618dc343d1f1f10abff0bbe5 --- /dev/null +++ b/test/resources/fake_configs/Google/Paligemma/paligemma-3b-pt-224/tokenizer_config.json @@ -0,0 +1,61 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "processor_class": "PaliGemmaProcessor", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "chat_template": "{% for message in messages %}{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{ '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>' }}{% endif %}" +} \ No newline at end of file diff --git a/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json b/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json new file mode 100755 index 0000000000000000000000000000000000000000..e4042a1126290fdb96ece2a4ad8dd7108c5de484 --- /dev/null +++ b/test/resources/fake_configs/HuggingFaceTB/SmolVLM2-256M-Video-Instruct/tokenizer_config.json 
@@ -0,0 +1,1192 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "0": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "5": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "6": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49152": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49153": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49154": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49155": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49156": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49157": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49158": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49159": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49160": { + "content": "", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49161": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49162": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49163": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49164": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49165": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49166": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49167": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49168": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49171": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49175": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49177": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49185": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49186": { + 
"content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49187": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49188": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49189": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49190": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49191": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49192": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49193": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49194": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49195": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49196": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49197": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49198": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49199": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49200": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49201": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49202": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49203": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49204": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49205": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49206": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49207": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49208": { + "content": 
"<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49209": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49210": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49211": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49212": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49213": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49214": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49215": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49216": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49217": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49218": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49219": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49220": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49221": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49222": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49223": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49224": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49225": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49226": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49227": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49228": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49229": { + "content": "<|reserved_special_token_38|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49230": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49231": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49232": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49233": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49234": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49235": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49236": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49237": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49238": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49239": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49240": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49241": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49242": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49243": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49244": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49245": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49246": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49247": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49248": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49249": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49250": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "49251": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49252": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49253": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49254": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49255": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49256": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49257": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49258": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49259": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49260": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49261": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49262": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49263": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49264": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49265": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49266": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49267": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49268": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49269": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49270": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49271": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "49272": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49273": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49274": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49275": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49276": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49277": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49278": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "49279": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "" + ], + "bos_token": "<|im_start|>", + "chat_template": "<|im_start|>{% for message in messages %}{{message['role'] | capitalize}}{% if message['content'][0]['type'] == 'image' %}{{':'}}{% else %}{{': '}}{% endif %}{% for line in message['content'] %}{% if line['type'] == 'text' %}{{line['text']}}{% elif line['type'] == 'image' %}{{ '' }}{% endif %}{% endfor %}\n{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "end_of_utterance_token": "", + "eos_token": "", + "extra_special_tokens": { + "end_of_utterance_token": "", + "fake_image_token": "", + "global_image_token": "", + "image_token": "" + }, + "fake_image_token": "", + "global_image_token": "", + "image_token": "", + "legacy": false, + "model_max_length": 8192, + "pad_token": "<|im_end|>", + "processor_class": "SmolVLMProcessor", + "tokenizer_class": "GPT2Tokenizer", + "truncation_side": "left", + "unk_token": "<|endoftext|>", + "vocab_size": 49152 +} diff --git a/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json new file mode 100755 index 0000000000000000000000000000000000000000..c32625c74fdedbde4c654d205c66f6b3dc852454 --- /dev/null +++ b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/preprocessor_config.json @@ -0,0 +1,28 @@ +{ + "crop_size": { + "height": 336, + "width": 336 + }, + "do_center_crop": true, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "CLIPImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "processor_class": "LlavaProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "shortest_edge": 336 + } +} \ No newline at end of file diff --git a/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json new file mode 100755 index 0000000000000000000000000000000000000000..8fbb221c7fdc95258d63f57d1e33aed4633068f6 --- /dev/null +++ 
b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/processor_config.json @@ -0,0 +1,7 @@ +{ + "image_token": "", + "num_additional_image_tokens": 1, + "patch_size": 14, + "processor_class": "LlavaProcessor", + "vision_feature_select_strategy": "default" +} \ No newline at end of file diff --git a/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json new file mode 100755 index 0000000000000000000000000000000000000000..f9c6572a84cc8de54b5807d28398daf1bc106dbf --- /dev/null +++ b/test/resources/fake_configs/Llava/llava-1.5-7b-hf/tokenizer_config.json @@ -0,0 +1,66 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": { + "image_token": "" + }, + "image_token": "", + "legacy": false, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_last_empty_assistant = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message.role == 'user' %}{{ '### User:\n' }}{% if message.content is not string %}{% for content in message.content %}{% if content.type == 'image' %}{{ '' }}{% elif content.type == 'text' %}{{ content.text }}{% else %}{# Do nothing #}{% endif %}{% endfor %}{% else %}{{ message.content }}{% endif %}{{ '\n\n' }}{% elif message.role == 'system' %}{{ '### System:\n' }}{% if message.content is not string %}{% for content in message.content %}{% if content.type == 'image' %}{{ '' }}{% elif content.type == 'text' %}{{ content.text }}{% else %}{# Do nothing #}{% endif %}{% endfor %}{% else %}{{ message.content }}{% endif %}{{ '\n\n' }}{% elif message.role == 'assistant' %}{{ '### Assistant:\n' }}{% if message.content is not string %}{% for content in message.content %}{% if content.type == 'text' %}{{ content.text }}{% else %}{# Do nothing #}{% endif %}{% endfor %}{% else %}{{ message.content }}{% endif %}{% else %}{{ '' }}{% endif %}{% endfor %}{% if not add_generation_prompt %}{{ eos_token }}{% elif add_generation_prompt %}{{ '### Assistant:\n' }}{% else %}{# Do nothing #}{% endif %}", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "left", + "processor_class": "LlavaProcessor", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizer", + "trust_remote_code": false, + "unk_token": "", + "use_default_system_prompt": false, + "return_token_type_ids": false +} \ No newline at end of file diff --git a/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json b/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json new file mode 100755 index 0000000000000000000000000000000000000000..f47a164c28afd23a0a1994b7ebd8afa7b6f33c32 --- /dev/null +++ 
b/test/resources/fake_configs/OpenGVLab/InternVL3-1B-hf/tokenizer_config.json @@ -0,0 +1,307 @@ +{ + "add_bos_token": false, + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151666": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151667": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151668": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151669": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151670": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151671": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151672": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151673": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151674": { + "content": "