"vscode:/vscode.git/clone" did not exist on "05ff90b692a6cdac4d8c06e7a4a4606d1b8fe1d6"
Unverified Commit 098e3006 authored by Przemyslaw Tredak, committed by GitHub
Browse files

Link attention docs to the main docs and fix errors reported by Sphinx (#1062)



* Link attention docs to the main docs and fix errors reported by Sphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Lower the version of nbsphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Change the URL of example_attention.py to GitHub
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes in the attention tutorial
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
parent 9c127ef5
...@@ -17,8 +17,8 @@ jobs: ...@@ -17,8 +17,8 @@ jobs:
uses: actions/checkout@v3 uses: actions/checkout@v3
- name: 'Install dependencies' - name: 'Install dependencies'
run: | run: |
pip install sphinx==7.1.2 sphinx_rtd_theme==2.0.0 nbsphinx==0.9.4 IPython ipython_genutils==0.2.0 ipywidgets==8.1.3 astroid==3.2.2 pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7
pip install breathe==4.35.0 sphinx-autoapi==3.1.1 pip install breathe==4.34.0 sphinx-autoapi==2.0.1
sudo apt-get install -y pandoc graphviz doxygen sudo apt-get install -y pandoc graphviz doxygen
export GIT_SHA=$(git show-ref --hash HEAD) export GIT_SHA=$(git show-ref --hash HEAD)
- name: 'Build docs' - name: 'Build docs'
......
...@@ -70,7 +70,7 @@ ...@@ -70,7 +70,7 @@
color: #8c0; color: #8c0;
} }
html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt { html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt {
background: rgba(118, 185, 0, 0.1); background: rgba(118, 185, 0, 0.1);
color: rgba(59,93,0,1); color: rgba(59,93,0,1);
border-top: solid 3px rgba(59,93,0,1); border-top: solid 3px rgba(59,93,0,1);
......
...@@ -109,6 +109,8 @@ napoleon_custom_sections = [ ...@@ -109,6 +109,8 @@ napoleon_custom_sections = [
("Parallelism parameters", "params_style"), ("Parallelism parameters", "params_style"),
("Optimization parameters", "params_style"), ("Optimization parameters", "params_style"),
("Values", "params_style"), ("Values", "params_style"),
("Graphing parameters", "params_style"),
("FP8-related parameters", "params_style"),
] ]
breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")} breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")}
......
This diff is collapsed.
...@@ -51,3 +51,4 @@ Transformer Engine documentation ...@@ -51,3 +51,4 @@ Transformer Engine documentation
:caption: Advanced :caption: Advanced
api/c/index api/c/index
examples/attention/attention.ipynb
...@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module): # pylint: disable=too-few-public-method ...@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module): # pylint: disable=too-few-public-method
class DenseGeneral(TransformerEngineBase): class DenseGeneral(TransformerEngineBase):
""" r"""
Applies a linear transformation to the incoming data :math:`y = xA^T + b` Applies a linear transformation to the incoming data :math:`y = xA^T + b`.
Parameters Parameters
---------- ----------
......
...@@ -1531,19 +1531,20 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods ...@@ -1531,19 +1531,20 @@ class TransformerLayer(nn.Module): # pylint: disable=too-few-public-methods
Indicate the min and max time-scales of rotary position embedding, Indicate the min and max time-scales of rotary position embedding,
only used when :attr:`enable_rotary_pos_emb=True` only used when :attr:`enable_rotary_pos_emb=True`
rotary_pos_emb_group_method: str, default = 'consecutive' rotary_pos_emb_group_method: str, default = 'consecutive'
Indicate the method to coupled the coordinates. It should be one of Indicate the method to couple the coordinates. It should be one of
['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2` ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`,
, d is the hidden dimension. 'consecutive' pairs index :math:`i` with :math:`i + 1`. where :math:`d` is the hidden dimension. 'consecutive' pairs index :math:`i` with
:math:`i + 1`.
low_rank_adaptation_scope: str, default = 'none' low_rank_adaptation_scope: str, default = 'none'
Indicate the scope to apply low rank adaptation. It should be one of Indicate the scope to apply low rank adaptation. It should be one of
['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj', ['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj',
'exclude_output_proj', 'exclude_mlp'] 'exclude_output_proj', 'exclude_mlp']
low_rank_adaptation_dim: int, default = 32 low_rank_adaptation_dim: int, default = 32
The dimension for low rank adaptation, only used when The dimension for low rank adaptation, only used when
:attr:`enable_low_rank_adaptation=True` :attr:`enable_low_rank_adaptation=True`
low_rank_adaptation_alpha: float, default = None low_rank_adaptation_alpha: float, default = None
The alpha for computing the scaling factor of LoRA output. The alpha for computing the scaling factor of LoRA output.
:math:`\frac{alpha}{rank} * lora_output`. None means no scaling. :math:`\frac{alpha}{rank} * lora\_output`. None means no scaling.
enable_sequence_parallel: bool, default = False enable_sequence_parallel: bool, default = False
Whether to enable sequence parallelism to operations except dot. Whether to enable sequence parallelism to operations except dot.
......
...@@ -328,8 +328,8 @@ def fp8_autocast( ...@@ -328,8 +328,8 @@ def fp8_autocast(
pjit(transformer.init, ...)(...) pjit(transformer.init, ...)(...)
.. note:: .. note::
We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len` We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`,
, and :attr:`amax_compute_algo`(with value 'max' and 'most_recent') in and :attr:`amax_compute_algo` (with value 'max' and 'most_recent') in
recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling
will trigger an assertion. will trigger an assertion.
......
...@@ -9,9 +9,11 @@ import warnings ...@@ -9,9 +9,11 @@ import warnings
import paddle import paddle
from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd
from transformer_engine.paddle.layer import LayerNormMLP, LayerNorm, MultiHeadAttention from .layernorm_mlp import LayerNormMLP
from transformer_engine.paddle.constants import AttnMaskTypes, LayerTypes, dist_group_type from .layernorm import LayerNorm
from transformer_engine.paddle.distributed import get_tp_group_and_world_size, track_rng_state from .attention import MultiHeadAttention
from ..constants import AttnMaskTypes, LayerTypes, dist_group_type
from ..distributed import get_tp_group_and_world_size, track_rng_state
class TransformerLayer(paddle.nn.Layer): class TransformerLayer(paddle.nn.Layer):
......
...@@ -10,7 +10,7 @@ from typing import Optional ...@@ -10,7 +10,7 @@ from typing import Optional
import torch import torch
from transformer_engine.pytorch.ops import FusibleOperation from transformer_engine.pytorch.ops.op import FusibleOperation
from transformer_engine.pytorch.ops.fuser import OperationFuser from transformer_engine.pytorch.ops.fuser import OperationFuser
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment