Unverified commit 4a1efe89, authored by Ming-Xu Huang and committed by GitHub
Browse files

[JAX] Fix the wrong shape of bias when fusing GEMMs. (#152)



* Allow update_collections and update_fp8_metas to return both Dict and FrozenDict.
Signed-off-by: Ming Huang <mingh@nvidia.com>

* Fix the wrong bias shape when fusing QKV or KV.
Signed-off-by: Ming Huang <mingh@nvidia.com>

* Reuse the tuplized features when creating the bias.
Signed-off-by: Ming Huang <mingh@nvidia.com>

* Replace get_args with a more readable alternative.
Signed-off-by: Ming Huang <mingh@nvidia.com>

---------
Signed-off-by: Ming Huang <mingh@nvidia.com>
parent a41bf711
...@@ -199,24 +199,32 @@ class FP8Helper: ...@@ -199,24 +199,32 @@ class FP8Helper:
""" """
Update the collections Update the collections
""" """
if not isinstance(original, FrozenDict): assert isinstance(original, (dict, FrozenDict))
original = FrozenDict(original) assert isinstance(new, (dict, FrozenDict))
frozen_original = FrozenDict(original) if not isinstance(original, FrozenDict) else original
for key in new: for key in new:
if key in original: if key in frozen_original:
original, _ = original.pop(key) frozen_original, _ = frozen_original.pop(key)
return FrozenDict({**new, **original}) new_coll = FrozenDict({**new, **frozen_original})
if not isinstance(original, FrozenDict):
new_coll = new_coll.unfreeze()
return new_coll
@staticmethod
def update_fp8_metas(state: Collection) -> Collection:
    """
    Update the FP8 metas stored in a variable collection.

    Runs ``FP8Helper._update_fp8_metas_impl`` on the sub-collection stored
    under ``FP8Helper.FP8_COLLECTION_NAME`` and returns a new collection
    with the refreshed metas spliced back in.  The return value mirrors the
    container type of ``state``: a ``FrozenDict`` input yields a
    ``FrozenDict``, a plain ``dict`` input yields a plain ``dict``.

    Args:
        state: a ``dict`` or ``FrozenDict`` variable collection.

    Returns:
        The collection with updated FP8 metas, or ``state`` unchanged when
        it holds no FP8 collection.

    Raises:
        TypeError: if ``state`` is neither a ``dict`` nor a ``FrozenDict``.
    """
    # Validate with an explicit raise instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if not isinstance(state, (dict, FrozenDict)):
        raise TypeError(f"state must be a dict or FrozenDict, got {type(state)}")

    # Guard clause: nothing to update when no FP8 collection is present.
    if FP8Helper.FP8_COLLECTION_NAME not in state:
        return state

    # Work on a frozen view so `.pop` returns (remainder, value) without
    # mutating the caller's collection.
    frozen_state = state if isinstance(state, FrozenDict) else FrozenDict(state)
    others, fp8_metas = frozen_state.pop(FP8Helper.FP8_COLLECTION_NAME)
    fp8_metas = FP8Helper._update_fp8_metas_impl(fp8_metas)
    new_state = FrozenDict({**others, FP8Helper.FP8_COLLECTION_NAME: fp8_metas})

    # Preserve the input container type for the caller.
    if not isinstance(state, FrozenDict):
        new_state = new_state.unfreeze()
    return new_state
@staticmethod @staticmethod
......
...@@ -425,7 +425,8 @@ class DenseGeneral(TransformerEngineBase): ...@@ -425,7 +425,8 @@ class DenseGeneral(TransformerEngineBase):
if self.use_bias: if self.use_bias:
bias = nn_partitioning.param_with_axes('bias', bias = nn_partitioning.param_with_axes('bias',
self.bias_init, (self.features,), self.bias_init,
features,
self.dtype, self.dtype,
axes=self.bias_axes) axes=self.bias_axes)
else: else:
...@@ -446,7 +447,8 @@ class DenseGeneral(TransformerEngineBase): ...@@ -446,7 +447,8 @@ class DenseGeneral(TransformerEngineBase):
y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ()))) y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
if bias is not None: if bias is not None:
y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,)) bais_shape = (1,) * (y.ndim - bias.ndim) + bias.shape
y += jnp.reshape(bias, bais_shape)
return y return y
...@@ -651,12 +653,14 @@ class LayerNormDenseGeneral(TransformerEngineBase): ...@@ -651,12 +653,14 @@ class LayerNormDenseGeneral(TransformerEngineBase):
bias = None bias = None
if self.use_bias: if self.use_bias:
bias = nn_partitioning.param_with_axes('bias', bias = nn_partitioning.param_with_axes('bias',
self.bias_init, (self.features,), self.bias_init,
features,
self.dtype, self.dtype,
axes=self.bias_axes) axes=self.bias_axes)
if bias is not None: if bias is not None:
z += jnp.reshape(bias, (1,) * (z.ndim - 1) + (-1,)) bais_shape = (1,) * (z.ndim - bias.ndim) + bias.shape
z += jnp.reshape(bias, bais_shape)
if self.depth_scaling is not None: if self.depth_scaling is not None:
z = z / self.depth_scaling z = z / self.depth_scaling
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment