Unverified Commit 098e3006 authored by Przemyslaw Tredak, committed by GitHub

Link attention docs to the main docs and fix errors reported by Sphinx (#1062)



* Link attention docs to the main docs and fix errors reported by Sphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Lower the version of nbsphinx
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Change the URL of example_attention.py to GitHub
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* More fixes in the attention tutorial
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
parent 9c127ef5
@@ -17,8 +17,8 @@ jobs:
        uses: actions/checkout@v3
      - name: 'Install dependencies'
        run: |
-         pip install sphinx==7.1.2 sphinx_rtd_theme==2.0.0 nbsphinx==0.9.4 IPython ipython_genutils==0.2.0 ipywidgets==8.1.3 astroid==3.2.2
-         pip install breathe==4.35.0 sphinx-autoapi==3.1.1
+         pip install sphinx==5.1.1 sphinx_rtd_theme==1.0.0 nbsphinx==0.8.10 IPython ipython_genutils==0.2.0 ipywidgets==8.0.2 astroid==2.15.7
+         pip install breathe==4.34.0 sphinx-autoapi==2.0.1
          sudo apt-get install -y pandoc graphviz doxygen
          export GIT_SHA=$(git show-ref --hash HEAD)
      - name: 'Build docs'
@@ -70,7 +70,7 @@
  color: #8c0;
}
-html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt {
+html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt {
  background: rgba(118, 185, 0, 0.1);
  color: rgba(59,93,0,1);
  border-top: solid 3px rgba(59,93,0,1);
@@ -109,6 +109,8 @@ napoleon_custom_sections = [
    ("Parallelism parameters", "params_style"),
    ("Optimization parameters", "params_style"),
    ("Values", "params_style"),
+   ("Graphing parameters", "params_style"),
+   ("FP8-related parameters", "params_style"),
]
breathe_projects = {"TransformerEngine": os.path.abspath("doxygen/xml/")}
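For context, napoleon_custom_sections tells Sphinx's napoleon extension to treat extra section headers in NumPy-style docstrings as first-class sections, and "params_style" renders them like a regular Parameters block. A minimal sketch of a docstring that the two new entries would cover (the function and parameter names here are hypothetical, not part of Transformer Engine):

    # Hypothetical docstring relying on the two sections registered above.
    def make_layer(fp8_enabled=False, graph_capture=False):
        """Construct a layer.

        Graphing parameters
        -------------------
        graph_capture: bool, default = False
            Whether the layer should be captured into a CUDA graph.

        FP8-related parameters
        ----------------------
        fp8_enabled: bool, default = False
            Whether to execute the layer in FP8.
        """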
@@ -51,3 +51,4 @@ Transformer Engine documentation
   :caption: Advanced

   api/c/index
+  examples/attention/attention.ipynb
@@ -366,8 +366,8 @@ class TransformerEngineBase(nn.Module):  # pylint: disable=too-few-public-methods
class DenseGeneral(TransformerEngineBase):
-    """
-    Applies a linear transformation to the incoming data :math:`y = xA^T + b`
+    r"""
+    Applies a linear transformation to the incoming data :math:`y = xA^T + b`.

    Parameters
    ----------
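The r""" prefix is the substance of this fix: the docstring embeds :math: markup whose backslashes (e.g. \frac) Python would otherwise treat as string escape sequences, which newer interpreters warn about and which can garble what Sphinx sees. A minimal sketch of the pattern, using a hypothetical function:

    # Without the raw-string prefix, "\f" in \frac is parsed as an escape
    # character (form feed), so Sphinx never sees the intended markup.
    def lora_scale(output, alpha, rank):
        r"""Scale LoRA output by :math:`\frac{alpha}{rank}`."""
        return (alpha / rank) * output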
@@ -1531,9 +1531,10 @@ class TransformerLayer(nn.Module):  # pylint: disable=too-few-public-methods
        Indicate the min and max time-scales of rotary position embedding,
        only used when :attr:`enable_rotary_pos_emb=True`
    rotary_pos_emb_group_method: str, default = 'consecutive'
-        Indicate the method to coupled the coordinates. It should be one of
-        ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`
-        , d is the hidden dimension. 'consecutive' pairs index :math:`i` with :math:`i + 1`.
+        Indicate the method to couple the coordinates. It should be one of
+        ['consecutive', 'alternate']. 'alternate' is to pair index :math:`i` with :math:`i + d/2`,
+        where :math:`d` is the hidden dimension. 'consecutive' pairs index :math:`i` with
+        :math:`i + 1`.
    low_rank_adaptation_scope: str, default = 'none'
        Indicate the scope to apply low rank adaptation. It should be one of
        ['none', 'all', 'qkv_proj', 'output_proj', 'mlp', 'exclude_qkv_proj',
@@ -1543,7 +1544,7 @@ class TransformerLayer(nn.Module):  # pylint: disable=too-few-public-methods
        :attr:`enable_low_rank_adaptation=True`
    low_rank_adaptation_alpha: float, default = None
        The alpha for computing the scaling factor of LoRA output.
-        :math:`\frac{alpha}{rank} * lora_output`. None means no scaling.
+        :math:`\frac{alpha}{rank} * lora\_output`. None means no scaling.
    enable_sequence_parallel: bool, default = False
        Whether to enable sequence parallelism to operations except dot.
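To make the two group methods concrete, here is a small standalone sketch (illustrative only, not Transformer Engine code) that enumerates which coordinate indices each method couples:

    # Hypothetical helper showing which coordinates each method pairs.
    def rotary_pairs(d, method="consecutive"):
        if method == "alternate":
            # pair index i with i + d/2
            return [(i, i + d // 2) for i in range(d // 2)]
        # "consecutive": pair index i with i + 1
        return [(i, i + 1) for i in range(0, d, 2)]

    print(rotary_pairs(8, "consecutive"))  # [(0, 1), (2, 3), (4, 5), (6, 7)]
    print(rotary_pairs(8, "alternate"))    # [(0, 4), (1, 5), (2, 6), (3, 7)]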
@@ -328,8 +328,8 @@ def fp8_autocast(
        pjit(transformer.init, ...)(...)

    .. note::
-        We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`
-        , and :attr:`amax_compute_algo`(with value 'max' and 'most_recent') in
+        We only support :attr:`margin`, :attr:`fp8_format`, :attr:`amax_history_len`,
+        and :attr:`amax_compute_algo` (with value 'max' and 'most_recent') in
        recipe.DelayedScaling currently. Other parameters in recipe.DelayedScaling
        will trigger an assertion.
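A hedged usage sketch for the note above, building a recipe.DelayedScaling that sets only the four supported fields (import paths assumed to match Transformer Engine's public JAX API):

    from transformer_engine.common import recipe
    from transformer_engine.jax import fp8_autocast

    # Only margin, fp8_format, amax_history_len and amax_compute_algo are
    # honored here; other DelayedScaling fields trigger an assertion.
    fp8_recipe = recipe.DelayedScaling(
        margin=0,
        fp8_format=recipe.Format.HYBRID,
        amax_history_len=1024,
        amax_compute_algo="max",  # or "most_recent"
    )

    with fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        ...  # e.g. pjit(transformer.init, ...)(...) as in the example above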
@@ -9,9 +9,11 @@ import warnings
import paddle
from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd

-from transformer_engine.paddle.layer import LayerNormMLP, LayerNorm, MultiHeadAttention
-from transformer_engine.paddle.constants import AttnMaskTypes, LayerTypes, dist_group_type
-from transformer_engine.paddle.distributed import get_tp_group_and_world_size, track_rng_state
+from .layernorm_mlp import LayerNormMLP
+from .layernorm import LayerNorm
+from .attention import MultiHeadAttention
+from ..constants import AttnMaskTypes, LayerTypes, dist_group_type
+from ..distributed import get_tp_group_and_world_size, track_rng_state


class TransformerLayer(paddle.nn.Layer):
@@ -10,7 +10,7 @@ from typing import Optional
import torch

-from transformer_engine.pytorch.ops import FusibleOperation
+from transformer_engine.pytorch.ops.op import FusibleOperation
from transformer_engine.pytorch.ops.fuser import OperationFuser