Commit 917d7a2b authored by Chao Liu

use vectorized read and write for threadwise generic tensor copy

parent 33c38e2a
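
The gist of the change, before the per-file diffs: the innermost access dimension of the threadwise copy is now traversed DataPerAccess elements at a time, so one vector load/store replaces DataPerAccess scalar ones. A minimal sketch of the core move (vector_type comes from the new threadwise_generic_tensor_slice_op.hip.hpp added below; Float = float and DataPerAccess = 4 are illustrative assumptions, not taken from the commit):

    // one float4 load + one float4 store move 4 elements per access,
    // assuming the lowest access dimension has stride 1 in both tensors
    using vector_t = typename vector_type<float, 4>::MemoryType;
    *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
        *reinterpret_cast<const vector_t*>(&p_src[src_index]);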
@@ -15,7 +15,7 @@ template <index_t BlockSize,
           class SrcAccessOrder,
           class DstAccessOrder,
           index_t SrcDataPerRead,
-          index_t DstDataPerRead>
+          index_t DstDataPerWrite>
 struct BlockwiseGenericTensorSliceCopy_v1
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
@@ -217,14 +217,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t clipboard_offset = thread_tensor_desc.GetOffsetFromMultiIndex(
                 clipboard_data_multi_id_begin); // cannot be constexpr, why?
 
-            threadwise_generic_tensor_slice_copy(SrcDesc{},
+            threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
                 p_src + src_offset + mThreadSrcOffset,
                 make_zero_array<index_t, nDim>(),
                 thread_tensor_desc,
                 p_clipboard + clipboard_offset,
                 make_zero_array<index_t, nDim>(),
                 thread_sub_tensor_lengths,
-                SrcAccessOrder{});
+                SrcAccessOrder{},
+                Number<SrcDataPerRead>{});
         });
     }
@@ -255,14 +256,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
             const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(
                 dst_data_multi_id_begin); // cannot be constexpr, why?
 
-            threadwise_generic_tensor_slice_copy(thread_tensor_desc,
+            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
                 p_clipboard + clipboard_offset,
                 make_zero_array<index_t, nDim>(),
                 DstDesc{},
                 p_dst + dst_offset + mThreadDstOffset,
                 make_zero_array<index_t, nDim>(),
                 thread_sub_tensor_lengths,
-                DstAccessOrder{});
+                DstAccessOrder{},
+                Number<DstDataPerWrite>{});
         });
     }
...
@@ -105,4 +105,13 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
     return x < y ? x : y;
 }
 
+// this is wrong
+// TODO: implement correct least common multiple, instead of calling max()
+template <class T, class... Ts>
+__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
+{
+    return max(x, xs...);
+}
+
 } // namespace mod_conv
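
The TODO above could be resolved with an lcm built on Euclid's gcd; a hedged sketch follows (not part of this commit — integer_gcd and the overload set are hypothetical, written to match the surrounding mod_conv style):

    // hypothetical replacement: lcm via gcd, folded over the parameter pack
    template <class T>
    __host__ __device__ constexpr T integer_gcd(T x, T y)
    {
        // Euclidean algorithm: gcd(x, 0) == x
        return y == 0 ? x : integer_gcd(y, x % y);
    }

    template <class T>
    __host__ __device__ constexpr T least_common_multiple(T x)
    {
        return x;
    }

    template <class T>
    __host__ __device__ constexpr T least_common_multiple(T x, T y)
    {
        // divide before multiplying to reduce overflow risk
        return (x / integer_gcd(x, y)) * y;
    }

    template <class T, class... Ts>
    __host__ __device__ constexpr T least_common_multiple(T x, Ts... xs)
    {
        return least_common_multiple(x, least_common_multiple(xs...));
    }

With this, least_common_multiple(4, 6, 8) evaluates to 24 at compile time, while the max()-based placeholder returns 8 — correct only in the special case where every argument divides the largest one.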
@@ -3,8 +3,7 @@
 
 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
-constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
-                                                                   WeiDesc)
+constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc, WeiDesc)
 {
     constexpr auto in_desc = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
@@ -34,8 +33,10 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(InDesc,
 }
 
 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
-constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(
-    InDesc, WeiDesc, LowerPads, UpperPads)
+constexpr auto get_convolution_with_padding_output_default_4d_tensor_descriptor(InDesc,
+                                                                                WeiDesc,
+                                                                                LowerPads,
+                                                                                UpperPads)
 {
     constexpr auto in_desc = InDesc{};
     constexpr auto wei_desc = WeiDesc{};
@@ -113,5 +114,6 @@ constexpr std::size_t calculate_convolution_memory_size(Float, InDesc, WeiDesc,
     constexpr index_t Y = wei_desc.GetLength(I2);
     constexpr index_t X = wei_desc.GetLength(I3);
 
-    return sizeof(Float) * (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
+    return sizeof(Float) *
+           (InDesc::GetElementSpace() + WeiDesc::GetElementSpace() + OutDesc::GetElementSpace());
 }
@@ -474,7 +474,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
             map_out_global2thread,
             Number<OutThreadCopyDataPerWrite_W>{});
 #else
-        threadwise_generic_tensor_slice_copy(
+        threadwise_generic_tensor_slice_copy_v1(
             out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
             p_out_thread,
             make_zero_array<index_t, 10>(),
...
@@ -423,7 +423,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
             map_out_global2thread,
             Number<OutThreadCopyDataPerWrite_W>{});
 #else
-        threadwise_generic_tensor_slice_copy(
+        threadwise_generic_tensor_slice_copy_v1(
            out_10d_thread_desc.ReorderGivenNew2Old(map_out_global2thread),
            p_out_thread,
            make_zero_array<index_t, 10>(),
...
@@ -405,7 +405,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(
+            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
             {0, 0, 0, 0, 0, 0, 0, 0},
             out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
...
@@ -358,7 +358,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(
+            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
             {0, 0, 0, 0, 0, 0, 0, 0},
             out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
...
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"
 
 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
@@ -170,7 +170,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already has blockwise offset built-in
-#if 0
+#if 1
         auto blockwise_wei_copy =
             BlockwiseGenericTensorSliceCopy_v1<BlockSize,
                                                Float,
@@ -188,7 +188,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
 #else
         constexpr auto map_k_e_2_e_k = Sequence<1, 0>{};
 
-        const auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
+        auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3<
             BlockSize,
             Float,
             decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)),
@@ -324,11 +324,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
         Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
 
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
         p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif
 
         __syncthreads();
@@ -355,11 +351,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
 
         // even iteration
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#if 0
-        blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
-#else
         p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
-#endif
 
         __syncthreads();
@@ -434,14 +426,16 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(
+            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
             {0, 0, 0, 0, 0, 0, 0, 0},
             out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
             p_out_thread_on_global,
             {0, 0, 0, 0, 0, 0, 0, 0},
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
-            arithmetic_sequence_gen<0, 8, 1>::SeqType{});
+            arithmetic_sequence_gen<0, 8, 1>::SeqType{},
+            Number<1>{});
         }
     }
 };
@@ -5,7 +5,7 @@
 #include "ConstantMatrixDescriptor.hip.hpp"
 #include "blockwise_generic_tensor_slice_op.hip.hpp"
 #include "blockwise_gemm.hip.hpp"
-#include "threadwise_tensor_slice_op.hip.hpp"
+#include "threadwise_generic_tensor_slice_op.hip.hpp"
 
 // define B = merge(N, Ho, Wo)
 template <index_t GridSize,
@@ -327,7 +327,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
 
-        threadwise_generic_tensor_slice_copy(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
+        threadwise_generic_tensor_slice_copy_v1(
+            out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,
             {0, 0, 0, 0, 0, 0, 0, 0},
             out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc,
...
+#pragma once
+#include "ConstantTensorDescriptor.hip.hpp"
+#include "ConstantMergedTensorDescriptor.hip.hpp"
+
+template <class Float,
+          class SrcDesc,
+          class DstDesc,
+          class SliceLengths,
+          class DimAccessOrder,
+          index_t DataPerAccess>
+__device__ void threadwise_generic_tensor_slice_copy_v1(
+    SrcDesc,
+    const Float* __restrict__ p_src,
+    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
+    DstDesc,
+    Float* __restrict__ p_dst,
+    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
+    SliceLengths,
+    DimAccessOrder,
+    Number<DataPerAccess>)
+{
+    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
+                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
+                  "wrong! # of dimensions not the same");
+
+    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
+
+#if 0
+    // doesn't compile, because merged-tensor reordering is not implemented
+    // TODO: implement tensor desc ops for merged-tensor
+    constexpr auto src_strides_in_access_order =
+        SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim - 1>{});
+
+    constexpr auto dst_strides_in_access_order =
+        DstDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim - 1>{});
+
+    // check src/dst stride on the lowest access dimension
+    static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
+                      (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
+                  "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
+                  "vectorized read/write");
+#endif
+
+    constexpr auto slice_lengths_in_access_order =
+        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
+
+    // check slice length on the lowest access dimension
+    static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
+                  "wrong! slice length on the lowest access dimension should be evenly divided by "
+                  "DataPerAccess");
+
+    constexpr index_t num_access_on_lowest_access_dimension =
+        slice_lengths_in_access_order.Back() / DataPerAccess;
+
+    constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
+        Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
+
+    using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
+
+#if 1
+    // run-time loop over the access space
+    ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
+        auto data_multi_id_in_access_order = access_multi_id;
+        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        // vectorized read and write: move DataPerAccess elements per access
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#else
+    // fully unrolled compile-time loop over the access space
+    static_ford<decltype(access_lengths)>{}([&](auto access_multi_id_) {
+        const auto access_multi_id = sequence2array(access_multi_id_);
+
+        auto data_multi_id_in_access_order = access_multi_id;
+        data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess;
+
+        const auto data_multi_id =
+            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
+
+        const index_t src_index =
+            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
+
+        const index_t dst_index =
+            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
+
+        *reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_index]);
+    });
+#endif
+}
@@ -192,58 +192,3 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
         });
     });
 }
-
-template <class Float, class SrcDesc, class DstDesc, class SliceLengths, class DimAccessOrder>
-__device__ void threadwise_generic_tensor_slice_copy(
-    SrcDesc,
-    const Float* __restrict__ p_src,
-    Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
-    DstDesc,
-    Float* __restrict__ p_dst,
-    Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
-    SliceLengths,
-    DimAccessOrder)
-{
-    constexpr index_t nDim = SrcDesc::GetNumOfDimension();
-
-    static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
-                      nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
-                  "wrong! # of dimensions not the same");
-
-    static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
-
-    constexpr auto slice_lengths_in_access_order =
-        SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
-
-#if 1
-    ford<decltype(slice_lengths_in_access_order)>{}([&](auto data_multi_id_in_access_order) {
-        const auto data_multi_id =
-            reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-        const index_t src_index =
-            SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-        const index_t dst_index =
-            DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-        p_dst[dst_index] = p_src[src_index];
-    });
-#else
-    static_ford<decltype(slice_lengths_in_access_order)>{}(
-        [&](auto data_multi_id_in_access_order_) {
-            constexpr auto data_multi_id_in_access_order =
-                sequence2array(decltype(data_multi_id_in_access_order_){});
-
-            const auto data_multi_id =
-                reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
-
-            const index_t src_index =
-                SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
-
-            const index_t dst_index =
-                DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
-
-            p_dst[dst_index] = p_src[src_index];
-        });
-#endif
-}