Link attention docs to the main docs and fix errors reported by Sphinx (#1062)

* Link attention docs to the main docs and fix errors reported by Sphinx
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* Lower the version of nbsphinx
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* More fixes
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* Change the URL of example_attention.py to GitHub
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* More fixes in the attention tutorial
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

Link attention docs to the main docs and fix errors reported by Sphinx (#1062)
* Link attention docs to the main docs and fix errors reported by Sphinx
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* Lower the version of nbsphinx
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* More fixes
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* Change the URL of example_attention.py to GitHub
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
* More fixes in the attention tutorial
  Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
098e3006 · Przemyslaw Tredak · GitHub · 9c127ef5 · 098e3006 · 098e3006
Unverified Commit 098e3006 authored Aug 01, 2024 by Przemyslaw Tredak Committed by GitHub Aug 01, 2024
10 changed files
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -17,8 +17,8 @@ jobs:
        uses: actions/checkout@v3
      - name: 'Install dependencies'
        run: |
-          pip install sphinx==7.1.2 sphinx_rtd_theme==2.0.0 nbsphinx==0.9.4 IPython ipython_genutils==0.2.0 ipywidgets==8.1.3 astroid==3.2.2
+          pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7
-          pip install breathe==4.35.0 sphinx-autoapi==3.1.1
+          pip install breathe==4.34.0 sphinx-autoapi==2.0.1
          sudo apt-get install -y pandoc graphviz doxygen
          export GIT_SHA=$(git show-ref --hash HEAD)
      - name: 'Build docs'

--- a/docs/_templates/layout.html
+++ b/docs/_templates/layout.html
@@ -70,7 +70,7 @@
    color: #8c0;
  }
-  html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt {
+  html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt {
    background: rgba(118, 185, 0, 0.1);
    color: rgba(59,93,0,1);
    border-top: solid 3px rgba(59,93,0,1);

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -109,6 +109,8 @@ napoleon_custom_sections = [
    ("Parallelism parameters", "params_style"),
    ("Optimization parameters", "params_style"),
    ("Values", "params_style"),
+    ("Graphing parameters", "params_style"),
+    ("FP8-related parameters", "params_style"),
 ]
 breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")}

--- a/docs/examples/attention/attention.ipynb
+++ b/docs/examples/attention/attention.ipynb
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -51,3 +51,4 @@ Transformer Engine documentation
   :caption: Advanced
   api/c/index
+   examples/attention/attention.ipynb
--- a/transformer_engine/jax/flax/module.py
+++ b/transformer_engine/jax/flax/module.py
@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module):  # pylint: disable=too-few-public-method
 class DenseGeneral(TransformerEngineBase):
-    """
+    r"""
-    Applies a linear transformation to the incoming data :math:`y = xA^T + b`
+    Applies a linear transformation to the incoming data :math:`y = xA^T + b`.
    Parameters
    ----------

--- a/transformer_engine/jax/flax/transformer.py
+++ b/transformer_engine/jax/flax/transformer.py
@@ -1531,9 +1531,10 @@ class TransformerLayer(nn.Module):  # pylint: disable=too-few-public-methods
        Indicate the min and max time-scales of rotary position embedding,
        only used when :attr:`enable_rotary_pos_emb=True`
    rotary_pos_emb_group_method: str, default = 'consecutive'
-        Indicate the method to coupled the coordinates. It should be one of
+        Indicate the method to couple the coordinates. It should be one of
-        ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`
+        ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`,
-        , d is the hidden dimension. 'consecutive' pairs index :math:`i` with :math:`i + 1`.
+        where :math:`d` is the hidden dimension. 'consecutive' pairs index :math:`i` with
+        :math:`i + 1`.
    low_rank_adaptation_scope: str, default = 'none'
        Indicate the scope to apply low rank adaptation. It should be one of
        ['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj',
@@ -1543,7 +1544,7 @@ class TransformerLayer(nn.Module):  # pylint: disable=too-few-public-methods
        :attr:`enable_low_rank_adaptation=True`
    low_rank_adaptation_alpha: float, default = None
        The alpha for computing the scaling factor of LoRA output.
-        :math:`\frac{alpha}{rank} * lora_output`. None means no scaling.
+        :math:`\frac{alpha}{rank} * lora\_output`. None means no scaling.
    enable_sequence_parallel: bool, default = False
        Whether to enable sequence parallelism to operations except dot.

--- a/transformer_engine/jax/fp8.py
+++ b/transformer_engine/jax/fp8.py
@@ -328,8 +328,8 @@ def fp8_autocast(
                    pjit(transformer.init, ...)(...)
    .. note::
-        We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`
+        We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`,
-        , and :attr:`amax_compute_algo`(with value 'max' and 'most_recent') in
+        and :attr:`amax_compute_algo` (with value 'max' and 'most_recent') in
        recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling
        will trigger an assertion.

--- a/transformer_engine/paddle/layer/transformer.py
+++ b/transformer_engine/paddle/layer/transformer.py
@@ -9,9 +9,11 @@ import warnings
 import paddle
 from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd
-from transformer_engine.paddle.layer import LayerNormMLP, LayerNorm, MultiHeadAttention
+from .layernorm_mlp import LayerNormMLP
-from transformer_engine.paddle.constants import AttnMaskTypes, LayerTypes, dist_group_type
+from .layernorm import LayerNorm
-from transformer_engine.paddle.distributed import get_tp_group_and_world_size, track_rng_state
+from .attention import MultiHeadAttention
+from ..constants import AttnMaskTypes, LayerTypes, dist_group_type
+from ..distributed import get_tp_group_and_world_size, track_rng_state
 class TransformerLayer(paddle.nn.Layer):

--- a/transformer_engine/pytorch/ops/sequential.py
+++ b/transformer_engine/pytorch/ops/sequential.py
@@ -10,7 +10,7 @@ from typing import Optional
 import torch
-from transformer_engine.pytorch.ops import FusibleOperation
+from transformer_engine.pytorch.ops.op import FusibleOperation
 from transformer_engine.pytorch.ops.fuser import OperationFuser