Unverified commit 098e3006, authored by Przemyslaw Tredak, committed by GitHub
Browse files

Link attention docs to the main docs and fix errors reported by Sphinx (#1062)



* Link attention docs to the main docs and fix errors reported by Sphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Lower the version of nbsphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Change the URL of example_attention.py to GitHub
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes in the attention tutorial
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
parent 9c127ef5
...@@ -17,8 +17,8 @@ jobs: ...@@ -17,8 +17,8 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: 'Install dependencies' - name: 'Install dependencies'
run: | run: |
pip install sphinx==7.1.2 sphinx_rtd_theme==2.0.0 nbsphinx==0.9.4 IPython ipython_genutils==0.2.0 ipywidgets==8.1.3 astroid==3.2.2 pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7
pip install breathe==4.35.0 sphinx-autoapi==3.1.1 pip install breathe==4.34.0 sphinx-autoapi==2.0.1
sudo apt-get install -y pandoc graphviz doxygen sudo apt-get install -y pandoc graphviz doxygen
export GIT_SHA=$(git show-ref --hash HEAD) export GIT_SHA=$(git show-ref --hash HEAD)
- name: 'Build docs' - name: 'Build docs'
......
...@@ -70,7 +70,7 @@ ...@@ -70,7 +70,7 @@
color: #8c0; color: #8c0;
} }
html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt { html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt {
background: rgba(118, 185, 0, 0.1); background: rgba(118, 185, 0, 0.1);
color: rgba(59,93,0,1); color: rgba(59,93,0,1);
border-top: solid 3px rgba(59,93,0,1); border-top: solid 3px rgba(59,93,0,1);
......
...@@ -109,6 +109,8 @@ napoleon_custom_sections = [ ...@@ -109,6 +109,8 @@ napoleon_custom_sections = [
("Parallelism parameters", "params_style"), ("Parallelism parameters", "params_style"),
("Optimization parameters", "params_style"), ("Optimization parameters", "params_style"),
("Values", "params_style"), ("Values", "params_style"),
("Graphing parameters", "params_style"),
("FP8-related parameters", "params_style"),
] ]
breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")} breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")}
......
This diff is collapsed.
...@@ -51,3 +51,4 @@ Transformer Engine documentation ...@@ -51,3 +51,4 @@ Transformer Engine documentation
:caption: Advanced :caption: Advanced
api/c/index api/c/index
examples/attention/attention.ipynb
...@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module): # pylint: disable=too-few-public-method ...@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module): # pylint: disable=too-few-public-method
class DenseGeneral(TransformerEngineBase): class DenseGeneral(TransformerEngineBase):
""" r"""
Applies a linear transformation to the incoming data :math:`y = xA^T + b` Applies a linear transformation to the incoming data :math:`y = xA^T + b`.
Parameters Parameters
---------- ----------
......
...@@ -1531,9 +1531,10 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods ...@@ -1531,9 +1531,10 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods
Indicate the min and max time-scales of rotary position embedding, Indicate the min and max time-scales of rotary position embedding,
only used when :attr:`enable_rotary_pos_emb=True` only used when :attr:`enable_rotary_pos_emb=True`
rotary_pos_emb_group_method: str, default = 'consecutive' rotary_pos_emb_group_method: str, default = 'consecutive'
Indicate the method to coupled the coordinates. It should be one of Indicate the method to couple the coordinates. It should be one of
['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2` ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`,
, d is the hidden dimension. 'consecutive' pairs index :math:`i` with :math:`i + 1`. where :math:`d` is the hidden dimension. 'consecutive' pairs index :math:`i` with
:math:`i + 1`.
low_rank_adaptation_scope: str, default = 'none' low_rank_adaptation_scope: str, default = 'none'
Indicate the scope to apply low rank adaptation. It should be one of Indicate the scope to apply low rank adaptation. It should be one of
['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj', ['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj',
...@@ -1543,7 +1544,7 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods ...@@ -1543,7 +1544,7 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods
:attr:`enable_low_rank_adaptation=True` :attr:`enable_low_rank_adaptation=True`
low_rank_adaptation_alpha: float, default = None low_rank_adaptation_alpha: float, default = None
The alpha for computing the scaling factor of LoRA output. The alpha for computing the scaling factor of LoRA output.
:math:`\frac{alpha}{rank} * lora_output`. None means no scaling. :math:`\frac{alpha}{rank} * lora\_output`. None means no scaling.
enable_sequence_parallel: bool, default = False enable_sequence_parallel: bool, default = False
Whether to enable sequence parallelism to operations except dot. Whether to enable sequence parallelism to operations except dot.
......
...@@ -328,8 +328,8 @@ def fp8_autocast( ...@@ -328,8 +328,8 @@ def fp8_autocast(
pjit(transformer.init, ...)(...) pjit(transformer.init, ...)(...)
.. note:: .. note::
We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len` We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`,
, and :attr:`amax_compute_algo`(with value 'max' and 'most_recent') in and :attr:`amax_compute_algo` (with value 'max' and 'most_recent') in
recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling
will trigger an assertion. will trigger an assertion.
......
...@@ -9,9 +9,11 @@ import warnings ...@@ -9,9 +9,11 @@ import warnings
import paddle import paddle
from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd
from transformer_engine.paddle.layer import LayerNormMLP, LayerNorm, MultiHeadAttention from .layernorm_mlp import LayerNormMLP
from transformer_engine.paddle.constants import AttnMaskTypes, LayerTypes, dist_group_type from .layernorm import LayerNorm
from transformer_engine.paddle.distributed import get_tp_group_and_world_size, track_rng_state from .attention import MultiHeadAttention
from ..constants import AttnMaskTypes, LayerTypes, dist_group_type
from ..distributed import get_tp_group_and_world_size, track_rng_state
class TransformerLayer(paddle.nn.Layer): class TransformerLayer(paddle.nn.Layer):
......
...@@ -10,7 +10,7 @@ from typing import Optional ...@@ -10,7 +10,7 @@ from typing import Optional
import torch import torch
from transformer_engine.pytorch.ops import FusibleOperation from transformer_engine.pytorch.ops.op import FusibleOperation
from transformer_engine.pytorch.ops.fuser import OperationFuser from transformer_engine.pytorch.ops.fuser import OperationFuser
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment