OpenDAS / TransformerEngine / Commits

Commit a68e5f87, authored Feb 24, 2026 by wenjh

Enable fp8 on nmz

Signed-off-by: wenjh <wenjh@sugon.com>

Parent: 99a1c744
Pipeline #3434 failed with stages in 0 seconds
Showing 7 changed files with 223 additions and 851 deletions (+223 -851):
- qa/L0_pytorch_unittest/test.sh (+5 -3)
- tests/pytorch/distributed/test_numerics.py (+16 -1)
- tests/pytorch/test_float8_blockwise_gemm_exact.py (+3 -3)
- tests/pytorch/test_float8_blockwise_scaling_exact.py (+116 -1)
- tests/pytorch/test_int8_channelwise_gemm_exact.py (+0 -796)
- transformer_engine/pytorch/quantization.py (+20 -22)
- transformer_engine/pytorch/utils.py (+63 -25)
qa/L0_pytorch_unittest/test.sh (+5 -3)

@@ -36,10 +36,12 @@
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8tensor.xml $TE_PATH/tests/pytorch/test_float8tensor.py || test_fail "test_float8tensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8blockwisetensor.xml $TE_PATH/tests/pytorch/test_float8blockwisetensor.py || test_fail "test_float8blockwisetensor.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
-NVTE_INT8_SIM_FP8=1 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact_int8.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py_int8"
+python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
 # channelwise int8 test
-NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py
-NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py
+python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py"
+NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_int8.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py_int8"
+NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_current_scaling_exact_int8_tensorwise.xml $TE_PATH/tests/pytorch/test_float8_current_scaling_exact.py || test_fail "test_float8_current_scaling_exact.py_int8_tensorwise"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
...
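The int8-simulation variants above are driven entirely by environment variables that the Python side reads once at import time (see the getenv call in the quantization.py hunk further down). Below is a minimal sketch of that flag parsing, assuming nothing beyond the standard library; the helper name is illustrative and not part of Transformer Engine.

    # Sketch: how a 0/1 flag such as NVTE_INT8_SIM_FP8 is typically consumed.
    # Because the value is read at import time, it must be exported before the
    # test process starts, which is what the `NVTE_INT8_SIM_FP8=1 python3 ...`
    # prefixes in test.sh accomplish.
    import os

    def env_flag(name: str, default: str = "0") -> bool:
        """Parse a 0/1 environment variable into a bool."""
        return bool(int(os.getenv(name, default)))

    if __name__ == "__main__":
        os.environ["NVTE_INT8_SIM_FP8"] = "1"               # what the shell prefix does
        print(env_flag("NVTE_INT8_SIM_FP8"))                # True
        print(env_flag("NVTE_INT8_SIM_FP8_TENSORWISE"))     # False unless exported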
tests/pytorch/distributed/test_numerics.py (+16 -1)

@@ -51,11 +51,26 @@ def _run_test(quantization):
 all_boolean = [True, False]


 @pytest.mark.parametrize("quantization", [None, "fp8", "mxfp8", "fp8_cs", "fp8_block_scaling", "nvfp4"])
 def test_distributed(quantization):
     if quantization == "fp8" and not fp8_available:
         pytest.skip(reason_for_no_fp8)
     if quantization == "fp8_cs" and not fp8_available:
         pytest.skip(reason_for_no_fp8)
     if quantization == "mxfp8" and not mxfp8_available:
         pytest.skip(reason_for_no_mxfp8)
     if quantization == "fp8_block_scaling" and not fp8_block_scaling_available:
         pytest.skip(reason_for_no_fp8_block_scaling)
     if quantization == "nvfp4" and not nvfp4_available:
         pytest.skip(reason_for_no_nvfp4)
     _run_test(quantization)


+@pytest.mark.parametrize("quantization", [None, "fp8", "mxfp8", "fp8_cs", "fp8_block_scaling", "nvfp4"])
+def test_int8_distributed(quantization):
+    if quantization == "fp8" and not fp8_available:
+        pytest.skip(reason_for_no_fp8)
+    if quantization == "fp8_cs" and not fp8_available:
...
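The new test_int8_distributed entry point reuses the same gate-then-skip pattern as test_distributed. Below is a self-contained sketch of that pattern, with the availability flag and skip reason hard-coded as stand-ins for the module-level values that test_numerics.py derives from the installed device and library support.

    # Stand-alone illustration of the parametrize-then-skip pattern used above.
    # `fp8_available` / `reason_for_no_fp8` are placeholders here, not the real
    # capability checks.
    import pytest

    fp8_available, reason_for_no_fp8 = False, "FP8 not supported on this device"

    def _run_test(quantization):
        assert quantization in (None, "fp8")

    @pytest.mark.parametrize("quantization", [None, "fp8"])
    def test_distributed_pattern(quantization):
        if quantization == "fp8" and not fp8_available:
            pytest.skip(reason_for_no_fp8)
        _run_test(quantization)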
tests/pytorch/test_float8_blockwise_gemm_exact.py (+3 -3)

@@ -47,7 +47,7 @@ def cublas_gemm_fp8_blockwise_case(
     atol: float = 0.0,
     rtol: float = 0.0,
 ):
-    if IS_HIP_EXTENSION and int8_simulation_fp8:
+    if IS_HIP_EXTENSION:
         if use_bias or use_gelu:
             pytest.skip("Bias and GELU not supported in int8 simulation mode on ROCm.")
         if not ((not x_columnwise and not w_columnwise and is_x_1d_scaled and not is_w_1d_scaled)
                 or (not x_columnwise and w_columnwise and is_x_1d_scaled and not is_w_1d_scaled)
                 or (x_columnwise and w_columnwise and is_x_1d_scaled and is_w_1d_scaled)):
...

@@ -249,7 +249,7 @@ def cublas_gemm_test_constraint_enforced(
     expected_err_cls=RuntimeError,
 ):
     if IS_HIP_EXTENSION:
-        pytest.skip("ROCm does not support cuBLAS GEMM. No need to test constraint enforcement.")
+        pytest.skip("ROCm does not support cuBLAS blockwise FP8 gemm. No need to test constraint enforcement.")
     if not fp8_blockwise_gemm_supported():
         pytest.skip("CUDA version does not support blockwise FP8 gemm.")
     # Setup device and random seed
...
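The ROCm guard in the first hunk only admits a few (layout, scaling) combinations. The short sketch below enumerates exactly which combinations pass that predicate; the predicate is copied from the hunk, while the enumeration itself is only illustrative.

    # Enumerate which (x_columnwise, w_columnwise, is_x_1d_scaled, is_w_1d_scaled)
    # combinations pass the ROCm guard shown above.
    from itertools import product

    def rocm_combo_allowed(x_columnwise, w_columnwise, is_x_1d_scaled, is_w_1d_scaled):
        return ((not x_columnwise and not w_columnwise and is_x_1d_scaled and not is_w_1d_scaled)
                or (not x_columnwise and w_columnwise and is_x_1d_scaled and not is_w_1d_scaled)
                or (x_columnwise and w_columnwise and is_x_1d_scaled and is_w_1d_scaled))

    if __name__ == "__main__":
        for combo in product([False, True], repeat=4):
            if rocm_combo_allowed(*combo):
                print(combo)
        # Prints (False, False, True, False), (False, True, True, False),
        # and (True, True, True, True).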
tests/pytorch/test_float8_blockwise_scaling_exact.py (+116 -1)

@@ -9,7 +9,7 @@ import pathlib
 import pytest
 import torch

 import transformer_engine.pytorch as te
-from transformer_engine.pytorch.fp8 import blockwise_fp8_block_len
+from transformer_engine.pytorch.fp8 import (FP8GlobalStateManager, blockwise_fp8_block_len)
 from transformer_engine.common.recipe import Float8BlockScaling
 from transformer_engine.pytorch.constants import TE_DType
 from transformer_engine.pytorch import (
...

@@ -507,6 +507,9 @@ def test_quantization_block_tiling_extrema_versus_reference(
         rtol=0.0,
     )


+def fp8_blockwise_scaling_supported() -> bool:
+    supported, _ = FP8GlobalStateManager.is_fp8_block_scaling_available()
+    return supported


 # FP8 per tesnor current scaling
 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
...

@@ -541,12 +544,65 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
         out_size,
         dtype,
         use_bias=True,
     ):
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
         tensor_map = self._check_golden_tensor_dumps(
             TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
         )
         if tensor_map is not None:
             fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map
         self.compare_recipe(
             recipe1,
             recipe2,
             batch_size,
             hidden_size,
             out_size,
             use_bias,
             seed=torch.initial_seed(),
             dtype=dtype,
             y_error=0.5,
             dgrad_error=1,
             wgrad_error=1,
             bgrad_error=0.5,
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )

+    @pytest.mark.parametrize(
+        "batch_size, hidden_size, out_size",
+        [
+            (16, 256, 128),
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+    @pytest.mark.parametrize(
+        "recipe1, recipe2",
+        [
+            (GetRecipes.none, GetRecipes.fp8_blockwise),
+        ],
+    )
+    def test_int8_current_scaling_with_linear_module(
+        self,
+        recipe1,
+        recipe2,
+        batch_size,
+        hidden_size,
+        out_size,
+        dtype,
+        use_bias=True,
+    ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
+        fp8_zero_tolerance_tensor_dumps_recipe2 = None
+        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
+        # if we cannot get all four tensors, then still set the tensor dump to None
...

@@ -612,12 +668,71 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
         out_size,
         dtype,
         use_bias=True,
     ):
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
         tensor_map = self._check_golden_tensor_dumps(
             TENSOR_DUMP_DIR,
             recipe2,
             (batch_size, hidden_size, out_size),
             dtype,
             use_bias,
             "LayerNorm",
         )
         if tensor_map is not None:
             fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map
         self.compare_recipe(
             recipe1,
             recipe2,
             batch_size,
             hidden_size,
             out_size,
             use_bias,
             seed=torch.initial_seed(),
             dtype=dtype,
             y_error=0.5,
             ln_out_error=0.5,
             dgrad_error=1.6,
             wgrad_error=1,
             bgrad_error=0.5,
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )

+    @pytest.mark.parametrize(
+        "batch_size, hidden_size, out_size",
+        [
+            (16, 256, 128),
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
+    @pytest.mark.parametrize(
+        "recipe1, recipe2",
+        [
+            (GetRecipes.none, GetRecipes.fp8_blockwise),
+        ],
+    )
+    def test_int8_current_scaling_with_layernorm_linear_module(
+        self,
+        recipe1,
+        recipe2,
+        batch_size,
+        hidden_size,
+        out_size,
+        dtype,
+        use_bias=True,
+    ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
+        if not fp8_blockwise_scaling_supported():
+            pytest.skip("CUDA version does not support blockwise FP8.")
+        fp8_zero_tolerance_tensor_dumps_recipe2 = None
+        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
+        # if we cannot get all four tensors, then still set the tensor dump to None
...
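Both new int8 tests flip NVTE_INT8_SIM_FP8 in-process and then call importlib.reload so that flags computed at module import time are re-evaluated. Below is a generic sketch of that toggle-and-reload pattern, using a throwaway module in place of the Transformer Engine fp8 module so it runs without a GPU build.

    # Module-level flags are frozen at import time; flipping the environment only
    # takes effect after importlib.reload(). A temp module stands in for the
    # real fp8 module here.
    import importlib
    import os
    import sys
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        with open(os.path.join(tmp, "demo_flags.py"), "w") as f:
            f.write("import os\nINT8_SIM = bool(int(os.getenv('NVTE_INT8_SIM_FP8', '0')))\n")
        sys.path.insert(0, tmp)
        import demo_flags
        print(demo_flags.INT8_SIM)                    # False unless the variable was already exported

        ori = os.environ.get("NVTE_INT8_SIM_FP8", None)   # remember the original value
        os.environ["NVTE_INT8_SIM_FP8"] = "1"
        importlib.reload(demo_flags)                  # re-evaluates the module-level flag
        print(demo_flags.INT8_SIM)                    # True

        # Restore the environment afterwards, as a well-behaved test should.
        if ori is None:
            del os.environ["NVTE_INT8_SIM_FP8"]
        else:
            os.environ["NVTE_INT8_SIM_FP8"] = ori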
tests/pytorch/test_int8_channelwise_gemm_exact.py (+0 -796)

File deleted (100644 → 0); diff collapsed in the original view.
transformer_engine/pytorch/quantization.py (+20 -22)

@@ -28,7 +28,7 @@ from transformer_engine.common.recipe import (
 )

 from .constants import dist_group_type
-from .utils import get_device_compute_capability
+from .utils import (get_device_compute_capability, is_gfx928, is_gfx936, is_gfx938)
 from .jit import jit_fuser
 from torch.utils.cpp_extension import IS_HIP_EXTENSION

 int8_simulation_fp8 = bool(int(os.getenv("NVTE_INT8_SIM_FP8", "0")))
...

@@ -45,18 +45,14 @@ __all__ = [
     "get_default_recipe",
 ]

-if IS_HIP_EXTENSION:
-    from transformer_engine.pytorch.utils import is_K100_AI, is_BW

 @functools.lru_cache(maxsize=None)
 def check_fp8_support() -> Tuple[bool, str]:
     """Return if fp8 support is available"""
     if IS_HIP_EXTENSION:
-        if (is_K100_AI() or is_BW()) and int8_simulation_fp8:
-            return True, "DCU turn on fp8 simulation with int8"
-        else:
-            return False, "DCU not support fp8 for now"
-    else:
+        if is_gfx938():
+            return True, ""
+        if (is_gfx928() or is_gfx936()) and int8_simulation_fp8 and int8_simulation_fp8_tensorwise:
+            return True, ""
     if get_device_compute_capability() >= (9, 0):
         # hopper and above
         return True, ""
     if get_device_compute_capability() < (8, 9):
         # pre-ada
...

@@ -71,6 +67,8 @@ def check_fp8_support() -> Tuple[bool, str]:

 @functools.lru_cache(maxsize=None)
 def check_mxfp8_support() -> Tuple[bool, str]:
     """Return if fp8 support is available"""
+    if IS_HIP_EXTENSION:
+        return False, "DCU not support mxfp8 for now"
     if get_device_compute_capability() >= (12, 0):
         return False, "MXFP8 (for all gemm layouts) is not supported on 12.0+ architectures yet."
     if get_device_compute_capability() >= (10, 0):
         # blackwell and above
...

@@ -83,7 +81,6 @@ def check_nvfp4_support() -> Tuple[bool, str]:
     """Return if nvfp4 support is available"""
     if IS_HIP_EXTENSION:
         return False, "NVFP4 is not supported on rocm platform."
-    else:
     if get_device_compute_capability() >= (10, 0):
         # blackwell and above
         return True, ""
     return False, "Device compute capability 10.0 or higher required for NVFP4 execution."
...

@@ -93,9 +90,10 @@ def check_nvfp4_support() -> Tuple[bool, str]:
 def check_fp8_block_scaling_support() -> Tuple[bool, str]:
     """Return if fp8 block scaling support is available"""
     if IS_HIP_EXTENSION:
-        if is_K100_AI() or is_BW() and int8_simulation_fp8:
+        if is_gfx938():
+            return True, ""
+        if (is_gfx928() or is_gfx936()) and int8_simulation_fp8:
             return True, ""
         else:
             return False, "DCU not support block_scaling fp8 for now"
     if get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.9:
         return True, ""
...
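check_fp8_support() and its siblings return a (supported, reason) pair that callers unpack to decide whether to take the FP8 path. Below is a hedged usage sketch; the import path mirrors the file shown above, and the ImportError fallback exists only so the snippet runs on a machine without Transformer Engine installed.

    # Caller-side handling of the (supported, reason) tuple.
    try:
        from transformer_engine.pytorch.quantization import check_fp8_support
    except ImportError:  # assumption: allow the sketch to run without TE installed
        def check_fp8_support():
            return False, "Transformer Engine not installed"

    fp8_ok, why_not = check_fp8_support()
    if fp8_ok:
        print("FP8 path enabled")
    else:
        print(f"Falling back to bf16/fp32: {why_not}")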
transformer_engine/pytorch/utils.py (+63 -25)

@@ -10,10 +10,9 @@ import os
 from typing import Any, Callable, List, Optional, Sequence, Tuple, Union

 import numpy as np
 import torch
+from torch.utils.cpp_extension import IS_HIP_EXTENSION

 from . import torch_version
 from .quantized_tensor import Quantizer
-from torch.utils.cpp_extension import IS_HIP_EXTENSION

 __all__ = ["get_device_compute_capability", "get_cudnn_version", "is_bf16_available"]
...

@@ -445,20 +444,64 @@ def assert_dim_for_fp8_exec(*tensors: List[torch.Tensor]) -> None:
     )


 if IS_HIP_EXTENSION:
-    def is_mi200():
-        """check whether this machine is mi200/210/250"""
-        import re
-        return (re.search('AMD Instinct MI2.0', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
-    def is_K100_AI():
-        """check whether this machine is K100_AI"""
-        import re
-        return (re.search('K100_AI', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
-    def is_BW():
-        """check whether this machine is BW"""
-        import re
-        return (re.search('BW', torch.cuda.get_device_name(torch.cuda.current_device())) is not None)
+    @functools.lru_cache(maxsize=None)
+    def _get_gcn_arch_impl(device: torch.device) -> int:
+        props = torch.cuda.get_device_properties(device)
+        import re
+        if re.search('gfx906', props.gcnArchName) is not None:
+            return 906
+        if re.search('gfx926', props.gcnArchName) is not None:
+            return 926
+        if re.search('gfx928', props.gcnArchName) is not None:
+            return 928
+        if re.search('gfx936', props.gcnArchName) is not None:
+            return 936
+        if re.search('gfx938', props.gcnArchName) is not None:
+            return 938
+        raise RuntimeError(f"Unsupported GCN Arch {props.gcnArchName}")
+
+    def _get_gcn_arch() -> int:
+        return _get_gcn_arch_impl(torch.cuda.current_device())
+
+    def is_gfx906() -> bool:
+        """check whether this machine is gfx906"""
+        return _get_gcn_arch() == 906
+
+    def is_gfx926() -> bool:
+        """check whether this machine is gfx926"""
+        return _get_gcn_arch() == 926
+
+    def is_gfx928() -> bool:
+        """check whether this machine is gfx928"""
+        return _get_gcn_arch() == 928
+
+    def is_gfx936() -> bool:
+        """check whether this machine is gfx928"""
+        return _get_gcn_arch() == 936
+
+    def is_gfx938() -> bool:
+        """check whether this machine is gfx928"""
+        return _get_gcn_arch() == 938
+else:
+    def is_gfx906() -> bool:
+        """gfx906 is only available on ROCm"""
+        return False
+
+    def is_gfx926() -> bool:
+        """gfx926 is only available on ROCm"""
+        return False
+
+    def is_gfx928() -> bool:
+        """gfx928 is only available on ROCm"""
+        return False
+
+    def is_gfx936() -> bool:
+        """gfx936 is only available on ROCm"""
+        return False
+
+    def is_gfx938() -> bool:
+        """gfx938 is only available on ROCm"""
+        return False


 def assert_dim_for_all_gather(
     tensor: torch.Tensor, with_all_gather: bool, quantizer: Quantizer
...

@@ -475,12 +518,8 @@ def is_bf16_compatible() -> bool:
     check on device compute capability to enforce sm_80 or higher.
     """
     if IS_HIP_EXTENSION:
-        # only MI200 and MI300 machines support bf16
-        if get_device_compute_capability() >= (9, 4) or is_mi200() or is_K100_AI() or is_BW():
-            return True
-        else:
-            return False
-    else:
+        # only these arch support bf16
+        return is_gfx928() or is_gfx936() or is_gfx938()
     return torch.cuda.get_device_capability()[0] >= 8
...

@@ -515,7 +554,6 @@ def is_non_tn_fp8_gemm_supported(is_blockwise: Optional[bool] = False) -> bool:
     if IS_HIP_EXTENSION:
         if is_blockwise:
             return False
-        else:
-            return True
+        return True
     device_capability = torch.cuda.get_device_capability()
     return (10, 0) <= device_capability < (12, 0) or device_capability >= (13, 0)
...
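The new helpers reduce to a regex match over the gcnArchName string reported by torch.cuda.get_device_properties(). The small sketch below applies the same mapping to hard-coded sample strings (the sample values are invented) so the mapping can be sanity-checked without a ROCm device.

    # Same gfx-arch mapping as _get_gcn_arch_impl above, applied to plain strings.
    import re

    ARCHES = (906, 926, 928, 936, 938)

    def gcn_arch_from_name(gcn_arch_name: str) -> int:
        for arch in ARCHES:
            if re.search(f"gfx{arch}", gcn_arch_name) is not None:
                return arch
        raise RuntimeError(f"Unsupported GCN Arch {gcn_arch_name}")

    if __name__ == "__main__":
        print(gcn_arch_from_name("gfx938:sramecc+:xnack-"))  # 938: native FP8 per the recipe checks
        print(gcn_arch_from_name("gfx928"))                  # 928: FP8 only via int8 simulation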