[JAX] Adapt latest JAX/PAX image (#744)

* value_and_grad requires same shape for input and gradients
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Use high precision layernorm
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Remove local_device_ids as it caused unexpected behaviors
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Revert "Remove local_device_ids as it caused unexpected behaviors"
  This reverts commit c54349b2ce1e96ae696cf0d74f5210e55002cf72.
  Signed-off-by: Reese Wang <rewang@nvidia.com>
---------
Signed-off-by: Reese Wang <rewang@nvidia.com>

[JAX] Adapt latest JAX/PAX image (#744)
* value_and_grad requires same shape for input and gradients
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Use high precision layernorm
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Remove local_device_ids as it caused unexpected behaviors
  Signed-off-by: Reese Wang <rewang@nvidia.com>
* Revert "Remove local_device_ids as it caused unexpected behaviors"
  This reverts commit c54349b2ce1e96ae696cf0d74f5210e55002cf72.
  Signed-off-by: Reese Wang <rewang@nvidia.com>
---------
Signed-off-by: Reese Wang <rewang@nvidia.com>
bfe21c3d · Reese Wang · GitHub · d541d208 · bfe21c3d · bfe21c3d
Unverified Commit bfe21c3d authored Apr 07, 2024 by Reese Wang Committed by GitHub Apr 06, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 6 deletions

tests/jax/test_custom_call_compute.py tests/jax/test_custom_call_compute.py +3 -2

tests/jax/utils.py tests/jax/utils.py +3 -4

No files found.
--- a/tests/jax/test_custom_call_compute.py
+++ b/tests/jax/test_custom_call_compute.py
@@ -485,7 +485,8 @@ class TestGeLuFP8(TestGeLu):
        primitive.defvjp(primitive_fwd, primitive_bwd)
        func = value_and_grad(lambda x, y, z, w: jnp.mean(primitive(x, y, z, w)), (0, 1, 2, 3))

-        return func(inputs, no_use, no_use, no_use)
+        return func(inputs, jnp.transpose(inputs, (2, 0, 1)),
+                    jnp.zeros(inputs.shape[-1], dtype=inputs.dtype), no_use)

    @pytest.mark.skipif(not is_fp8_supported, reason=reason)
    @pytest.mark.parametrize('shape', [(32, 2, 64), (64, 2, 256)])
@@ -582,7 +583,7 @@ class TestGatedGeLuFP8(TestGatedGeLu):
        primitive.defvjp(primitive_fwd, primitive_bwd)
        func = value_and_grad(lambda x, y, z: jnp.mean(primitive(x, y, z)), (0, 1, 2))

-        return func(inputs, no_use, no_use)
+        return func(inputs, jnp.transpose(inputs, (1, 2, 0)), no_use)

    @pytest.mark.skipif(not is_fp8_supported, reason=reason)
    @pytest.mark.parametrize('shape', [(32, 2, 64), (64, 2, 256)])

--- a/tests/jax/utils.py
+++ b/tests/jax/utils.py
@@ -731,19 +731,18 @@ class LayerNorm(nn.Module):
                                                   axes=('embed',))
            bias = jnp.asarray(bias, self.dtype)

-            y = jnp.asarray(y, self.dtype)
            if not self.zero_centered_gamma:
                z = y * scale + bias
            else:
-                z = y * (scale + 1) + bias
+                z = y * (scale + 1.) + bias
        else:
            assert self.layernorm_type == 'rmsnorm'
            assert not self.zero_centered_gamma
            mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
-            y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
+            y = x * lax.rsqrt(mean2 + self.epsilon)
            z = y * scale

-        return z
+        return jnp.asarray(z, self.dtype)


 class RelativePositionBiases(nn.Module):