sglang · Commits

Commit 7248272c (unverified)
Authored Jun 29, 2025 by Baizhou Zhang; committed via GitHub on Jun 29, 2025

Add dsv3 router gemm kernel (#7627)

Parent: 22352d47

Showing 8 changed files with 398 additions and 0 deletions (+398, -0)
Changed files:

    sgl-kernel/CMakeLists.txt                        +1    -0
    sgl-kernel/benchmark/bench_dsv3_router_gemm.py   +56   -0
    sgl-kernel/csrc/common_extension.cc              +3    -0
    sgl-kernel/csrc/gemm/dsv3_router_gemm.cu         +286  -0
    sgl-kernel/include/sgl_kernel_ops.h              +1    -0
    sgl-kernel/python/sgl_kernel/__init__.py         +1    -0
    sgl-kernel/python/sgl_kernel/gemm.py             +18   -0
    sgl-kernel/tests/test_dsv3_router_gemm.py        +32   -0
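For orientation before the per-file diffs: the new dsv3_router_gemm op computes the DeepSeek-V3 router-logits GEMM for small decode batches (1 to 16 tokens), multiplying bf16 hidden states [num_tokens, 7168] by the bf16 router weight [256, 7168] and returning fp32 logits [num_tokens, 256]. A minimal reference of those semantics, matching the comparison used in the benchmark and test added below:

    import torch
    import torch.nn.functional as F

    hidden_states = torch.randn(4, 7168, dtype=torch.bfloat16, device="cuda")    # [num_tokens, hidden_dim]
    router_weight = torch.randn(256, 7168, dtype=torch.bfloat16, device="cuda")  # [num_experts, hidden_dim]
    ref_logits = F.linear(hidden_states, router_weight).to(torch.float32)        # [num_tokens, num_experts]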
sgl-kernel/CMakeLists.txt

@@ -222,6 +222,7 @@ set(SOURCES
     "csrc/gemm/awq_kernel.cu"
     "csrc/gemm/bmm_fp8.cu"
     "csrc/gemm/dsv3_fused_a_gemm.cu"
+    "csrc/gemm/dsv3_router_gemm.cu"
     "csrc/gemm/fp8_blockwise_gemm_kernel.cu"
     "csrc/gemm/fp8_gemm_kernel.cu"
     "csrc/gemm/int8_gemm_kernel.cu"
sgl-kernel/benchmark/bench_dsv3_router_gemm.py  (new file, mode 100644)

import argparse

import torch
import torch.nn.functional as F
import triton
import triton.testing
from sgl_kernel import dsv3_router_gemm


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["num_tokens"],
        x_vals=[i + 1 for i in range(16)],
        x_log=False,
        line_arg="impl",
        line_vals=["torch", "sgl-kernel"],
        line_names=["torch", "dsv3_router_gemm"],
        styles=[("blue", "-"), ("orange", "-")],
        ylabel="TFLOPs",
        plot_name="input-bf16-output-fp32 dsv3 router gemm throughput",
        args={},
    )
)
def benchmark(num_tokens, impl):
    # M: num_tokens, K: hidden_dim, N: num_experts
    M, K, N = num_tokens, 7168, 256

    mat_a = torch.randn((M, K), dtype=torch.bfloat16, device="cuda").contiguous()
    mat_b = torch.randn((N, K), dtype=torch.bfloat16, device="cuda").contiguous()

    quantiles = [0.5, 0.2, 0.8]

    if impl == "torch":

        def runner():
            F.linear(mat_a, mat_b).to(torch.float32)

    elif impl == "sgl-kernel":

        def runner():
            dsv3_router_gemm(mat_a, mat_b)

    ms, min_ms, max_ms = triton.testing.do_bench(runner, quantiles=quantiles)

    def tflops(t_ms):
        flops = 2 * M * K * N
        return flops / (t_ms * 1e-3) / 1e12

    return tflops(ms), tflops(max_ms), tflops(min_ms)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    args = parser.parse_args()

    benchmark.run(print_data=True, show_plots=True, save_path="bench_dsv3_router_gemm")
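The benchmark sweeps num_tokens from 1 to 16 and reports throughput in TFLOPs using the standard 2·M·K·N flop count for a GEMM. A quick sanity check of that accounting (illustrative numbers, not measured results):

    M, K, N = 16, 7168, 256
    flops = 2 * M * K * N                          # ~5.87e7 FLOP per call
    latency_ms = 0.01                              # hypothetical 10-microsecond latency
    tflops = flops / (latency_ms * 1e-3) / 1e12    # ~5.9 TFLOPs at that latency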
sgl-kernel/csrc/common_extension.cc

@@ -158,6 +158,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
       " Tensor expert_offsets, Tensor sf_offsets) -> ()");
   m.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
 
+  m.def("dsv3_router_gemm(Tensor! output, Tensor mat_a, Tensor mat_b) -> ()");
+  m.impl("dsv3_router_gemm", torch::kCUDA, &dsv3_router_gemm);
+
   /*
    * From csrc/moe
    */
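The schema registers an out-variant op: the caller passes a preallocated fp32 output tensor that the CUDA kernel fills in place. A sketch of driving the op through the dispatcher directly (the Python wrapper added in sgl_kernel/gemm.py below performs exactly this allocation):

    import torch
    import sgl_kernel  # loading the package registers torch.ops.sgl_kernel.*

    mat_a = torch.randn(2, 7168, dtype=torch.bfloat16, device="cuda")    # hidden states
    mat_b = torch.randn(256, 7168, dtype=torch.bfloat16, device="cuda")  # router weight
    output = torch.empty(2, 256, dtype=torch.float32, device="cuda")
    torch.ops.sgl_kernel.dsv3_router_gemm(output, mat_a, mat_b)          # writes logits into output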
sgl-kernel/csrc/gemm/dsv3_router_gemm.cu  (new file, mode 100644)
/*
* Adapted from
* https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu
* https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp
*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include "cuda_bf16.h"
#include "cuda_runtime.h"
#include "utils.h"
// Custom FMA implementation using PTX assembly instructions
__device__ __forceinline__ void fma(float2& d, float2 const& a, float2 const& b, float2 const& c) {
  asm volatile(
      "fma.rn.f32x2 %0, %1, %2, %3;\n"
      : "=l"(reinterpret_cast<uint64_t&>(d))
      : "l"(reinterpret_cast<uint64_t const&>(a)),
        "l"(reinterpret_cast<uint64_t const&>(b)),
        "l"(reinterpret_cast<uint64_t const&>(c)));
}

// Convert 8 bfloat16 values from a uint4 to float array - optimized conversion
template <int VPT>
__device__ __forceinline__ void bf16_uint4_to_float8(uint4 const& vec, float* dst) {
  __nv_bfloat16* bf16_ptr = reinterpret_cast<__nv_bfloat16*>(const_cast<uint4*>(&vec));
#pragma unroll
  for (int i = 0; i < VPT; i++) {
    dst[i] = __bfloat162float(bf16_ptr[i]);
  }
}
template <typename T, int kBlockSize, int VPT, int kNumTokens, int kNumExperts, int kHiddenDim>
__global__ __launch_bounds__(128, 1) void router_gemm_kernel(float* out, T const* mat_a, T const* mat_b) {
  // Each block handles one expert column
  int const n_idx = blockIdx.x;
  int const tid = threadIdx.x;
  constexpr int kWarpSize = 32;
  constexpr int kNumWarps = kBlockSize / kWarpSize;

  // Constants for this kernel
  constexpr int k_elems_per_k_iteration = VPT * kBlockSize;
  constexpr int k_iterations = kHiddenDim / k_elems_per_k_iteration;  // Total K iterations

  // Initialize accumulators for all M rows
  float acc[kNumTokens] = {};

  // Shared memory for warp-level reduction
  __shared__ float sm_reduction[kNumTokens][kNumWarps];  // kNumWarps

  // B matrix is in column-major order, so we can directly load a column for the n_idx expert
  T const* b_col = mat_b + n_idx * kHiddenDim;

  // Pre-compute k_base values for each iteration to help compiler optimize
  // int k_bases[k_iterations];
  int k_bases[k_iterations];
#pragma unroll
  for (int ki = 0; ki < k_iterations; ki++) {
    k_bases[ki] = ki * k_elems_per_k_iteration + tid * VPT;
  }

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.wait;");
#endif

  // Process the GEMM in chunks
  for (int ki = 0; ki < k_iterations; ki++) {
    int const k_base = k_bases[ki];

    // Load B matrix values using vector load (8 bf16 values)
    uint4 b_vec = *reinterpret_cast<uint4 const*>(b_col + k_base);

    // Convert B values to float
    float b_float[VPT];
    bf16_uint4_to_float8<VPT>(b_vec, b_float);

    // Process each token
#pragma unroll
    for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
      // Load both rows of A matrix using vector loads
      uint4 a_vec = *reinterpret_cast<uint4 const*>(mat_a + (m_idx * kHiddenDim) + k_base);

      // Convert A values to float
      float a_float[VPT];
      bf16_uint4_to_float8<VPT>(a_vec, a_float);

      // Process elements in this chunk
#pragma unroll
      for (int k = 0; k < VPT; k++) {
        float a = a_float[k];
        float b = b_float[k];
        acc[m_idx] += a * b;
      }
    }
  }

  // Perform warp-level reduction
  int const warpSize = 32;
  int const warpId = tid / warpSize;
  int const laneId = tid % warpSize;

  // Register for warp-level reduction results
  float warp_result[kNumTokens];
#pragma unroll
  for (int m_idx = 0; m_idx < kNumTokens; m_idx++) {
    warp_result[m_idx] = acc[m_idx];
  }

  // Perform warp-level reduction using optimized butterfly pattern
#pragma unroll
  for (int m = 0; m < kNumTokens; m++) {
    float sum = warp_result[m];

    // Butterfly reduction pattern
    sum += __shfl_xor_sync(0xffffffff, sum, 16);
    sum += __shfl_xor_sync(0xffffffff, sum, 8);
    sum += __shfl_xor_sync(0xffffffff, sum, 4);
    sum += __shfl_xor_sync(0xffffffff, sum, 2);
    sum += __shfl_xor_sync(0xffffffff, sum, 1);

    // Only the first thread in each warp stores to shared memory
    if (laneId == 0) {
      sm_reduction[m][warpId] = sum;
    }
  }

  __syncthreads();

  // Final reduction across warps (only first thread)
  if (tid == 0) {
#pragma unroll
    for (int m = 0; m < kNumTokens; m++) {
      float final_sum = 0.0f;

      // Sum across the kNumWarps
#pragma unroll
      for (int w = 0; w < kNumWarps; w++) {
        final_sum += sm_reduction[m][w];
      }

      // Write final result
      out[m * kNumExperts + n_idx] = final_sum;
    }
  }

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
#endif
}
template <typename T, int kNumTokens, int kNumExperts, int kHiddenDim>
void invokeRouterGemm(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream) {
  constexpr int VPT = 16 / sizeof(T);
  constexpr int kBlockSize = 128;

  cudaLaunchConfig_t config;
  config.gridDim = kNumExperts;
  config.blockDim = kBlockSize;
  config.dynamicSmemBytes = 0;
  config.stream = stream;

  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = getEnvEnablePDL();
  config.numAttrs = 1;
  config.attrs = attrs;

  cudaLaunchKernelEx(
      &config, router_gemm_kernel<T, kBlockSize, VPT, kNumTokens, kNumExperts, kHiddenDim>, output, mat_a, mat_b);
}
template void invokeRouterGemm<__nv_bfloat16, 1, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 2, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 3, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 4, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 5, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 6, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 7, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 8, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 9, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 10, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 11, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 12, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 13, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 14, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 15, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template void invokeRouterGemm<__nv_bfloat16, 16, 256, 7168>(
    float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t);
template <int kBegin, int kEnd, int kNumExperts, int kHiddenDim>
struct LoopUnroller {
  static void unroll(
      int num_tokens, float* output, __nv_bfloat16 const* input, __nv_bfloat16 const* weights, cudaStream_t stream) {
    if (num_tokens == kBegin) {
      invokeRouterGemm<__nv_bfloat16, kBegin, kNumExperts, kHiddenDim>(output, input, weights, stream);
    } else {
      LoopUnroller<kBegin + 1, kEnd, kNumExperts, kHiddenDim>::unroll(num_tokens, output, input, weights, stream);
    }
  }
};

template <int kEnd, int kNumExperts, int kHiddenDim>
struct LoopUnroller<kEnd, kEnd, kNumExperts, kHiddenDim> {
  static void unroll(
      int num_tokens, float* output, __nv_bfloat16 const* input, __nv_bfloat16 const* weights, cudaStream_t stream) {
    if (num_tokens == kEnd) {
      invokeRouterGemm<__nv_bfloat16, kEnd, kNumExperts, kHiddenDim>(output, input, weights, stream);
    } else {
      throw std::invalid_argument("Invalid num_tokens, only supports 1 to 16");
    }
  }
};
void dsv3_router_gemm(
    torch::Tensor& output,       // [num_tokens, num_experts]
    const torch::Tensor& mat_a,  // [num_tokens, hidden_dim]
    const torch::Tensor& mat_b   // [num_experts, hidden_dim]
) {
  TORCH_CHECK(output.dim() == 2 && mat_a.dim() == 2 && mat_b.dim() == 2);
  const int num_tokens = mat_a.size(0);
  constexpr int num_experts = 256;
  constexpr int hidden_dim = 7168;

  TORCH_CHECK(mat_a.size(1) == mat_b.size(1), "mat_a and mat_b must have the same hidden_dim");
  TORCH_CHECK(mat_a.size(1) == hidden_dim, "currently hidden_dim only supports 7168");
  TORCH_CHECK(mat_b.size(0) == num_experts, "currently num_experts only supports 256");
  TORCH_CHECK(
      num_tokens >= 1 && num_tokens <= 16, "currently num_tokens must be less than or equal to 16 for router_gemm");
  TORCH_CHECK(mat_a.dtype() == torch::kBFloat16, "mat_a must be bf16");
  TORCH_CHECK(mat_b.dtype() == torch::kBFloat16, "mat_b must be bf16");
  TORCH_CHECK(output.dtype() == torch::kFloat32, "output must be float32");

  auto const sm = getSMVersion();
  TORCH_CHECK(sm >= 90, "required CUDA ARCH >= SM_90");

  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  LoopUnroller<1, 16, num_experts, hidden_dim>::unroll(
      num_tokens,
      reinterpret_cast<float*>(output.mutable_data_ptr()),
      reinterpret_cast<__nv_bfloat16 const*>(mat_a.data_ptr()),
      reinterpret_cast<__nv_bfloat16 const*>(mat_b.data_ptr()),
      stream);
}
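To summarize the kernel's decomposition: the launcher issues one 128-thread block per expert column (gridDim = kNumExperts), each block strides over the 7168-wide hidden dimension with 16-byte (8 × bf16) vector loads, keeps per-token partial sums in registers, and reduces them with warp shuffles plus a small shared-memory pass before thread 0 of the block writes the fp32 logit. The griddepcontrol instructions and the programmatic-stream-serialization launch attribute enable programmatic dependent launch on SM90+ when getEnvEnablePDL() allows it. A rough Python sketch of those semantics (illustrative only; it mirrors the math, not the CUDA execution):

    import torch

    def router_gemm_emulated(mat_a: torch.Tensor, mat_b: torch.Tensor) -> torch.Tensor:
        # mat_a: [num_tokens, hidden_dim] bf16, mat_b: [num_experts, hidden_dim] bf16
        num_tokens = mat_a.shape[0]
        num_experts = mat_b.shape[0]
        out = torch.empty(num_tokens, num_experts, dtype=torch.float32, device=mat_a.device)
        for n_idx in range(num_experts):         # one thread block per expert column
            b_col = mat_b[n_idx].float()         # that expert's weight row, upcast as in the kernel
            for m_idx in range(num_tokens):      # every token accumulated within the same block
                out[m_idx, n_idx] = torch.dot(mat_a[m_idx].float(), b_col)
        return out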
sgl-kernel/include/sgl_kernel_ops.h

@@ -200,6 +200,7 @@ void bmm_fp8(
     at::Tensor workspace_buffer,
     int64_t cublas_handle,
     int64_t cuda_stream);
 
+void dsv3_router_gemm(torch::Tensor& output, const torch::Tensor& mat_a, const torch::Tensor& mat_b);
 void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, torch::Tensor const& mat_b);
sgl-kernel/python/sgl_kernel/__init__.py

@@ -34,6 +34,7 @@ from sgl_kernel.gemm import (
     bmm_fp8,
     cutlass_scaled_fp4_mm,
     dsv3_fused_a_gemm,
+    dsv3_router_gemm,
     fp8_blockwise_scaled_mm,
     fp8_scaled_mm,
     int8_scaled_mm,
sgl-kernel/python/sgl_kernel/gemm.py

@@ -259,6 +259,24 @@ def qserve_w4a8_per_group_gemm(
     return out_feats
 
 
+def dsv3_router_gemm(
+    hidden_states: torch.Tensor,
+    router_weights: torch.Tensor,
+) -> torch.Tensor:
+    output = torch.empty(
+        hidden_states.shape[0],
+        router_weights.shape[0],
+        device=hidden_states.device,
+        dtype=torch.float32,
+    )
+    torch.ops.sgl_kernel.dsv3_router_gemm(
+        output,
+        hidden_states,
+        router_weights,
+    )
+    return output
+
+
 def shuffle_rows(input_tensor, dst2src_map, output_tensor_shape):
     output_tensor = torch.empty(
         output_tensor_shape,
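This wrapper is the user-facing entry point: it allocates the fp32 logits tensor and dispatches to the registered CUDA op. A short usage sketch (the top-k selection only illustrates how a MoE router would typically consume the logits; it is not part of this commit):

    import torch
    from sgl_kernel import dsv3_router_gemm

    hidden_states = torch.randn(8, 7168, dtype=torch.bfloat16, device="cuda")
    router_weight = torch.randn(256, 7168, dtype=torch.bfloat16, device="cuda")

    logits = dsv3_router_gemm(hidden_states, router_weight)    # fp32, shape [8, 256]
    topk_vals, topk_ids = torch.topk(logits, k=8, dim=-1)      # e.g. pick 8 experts per token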
sgl-kernel/tests/test_dsv3_router_gemm.py  (new file, mode 100644)

import pytest
import torch
import torch.nn.functional as F
from sgl_kernel import dsv3_router_gemm


@pytest.mark.parametrize("num_tokens", [i + 1 for i in range(16)])
def test_dsv3_router_gemm(num_tokens):
    num_experts = 256
    hidden_dim = 7168
    mat_a = torch.randn(
        (num_tokens, hidden_dim), dtype=torch.bfloat16, device="cuda"
    ).contiguous()
    mat_b = torch.randn(
        (num_experts, hidden_dim), dtype=torch.bfloat16, device="cuda"
    ).contiguous()
    output = torch.empty(
        (num_tokens, num_experts), dtype=torch.float32, device="cuda"
    ).contiguous()

    ref = F.linear(mat_a, mat_b).to(torch.float32)
    output = dsv3_router_gemm(mat_a, mat_b)

    assert torch.allclose(
        output, ref, rtol=1e-2, atol=1e-3
    ), "Router GEMM output mismatch with torch.nn.functional.linear reference"


if __name__ == "__main__":
    pytest.main([__file__])
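The test compares against an F.linear reference with rtol=1e-2 / atol=1e-3, which leaves headroom for the different accumulation orders of the custom kernel and the cuBLAS path on bf16 inputs. If the assertion ever trips, a small helper like the following (illustrative only, not part of the commit) makes the discrepancy easy to inspect:

    import torch

    def report_error(output: torch.Tensor, ref: torch.Tensor) -> None:
        # Print max absolute and relative error between kernel output and reference.
        max_abs = (output - ref).abs().max().item()
        max_rel = ((output - ref).abs() / ref.abs().clamp_min(1e-6)).max().item()
        print(f"max abs err {max_abs:.3e}, max rel err {max_rel:.3e}")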