Commit c353b35b authored by zhanghj2's avatar zhanghj2
Browse files

恢复支持旧接口

parent c566af36
......@@ -5,7 +5,7 @@
#include "dense_decode.h"
#include "dense_decode_qkvfp8.h"
#include "dense_decode_kvfp8.h"
#include "../extension/flash_api.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.doc() = "FlashMLA";
m.def("sparse_decode_fwd", &sparse_attn_decode_interface);
......@@ -13,4 +13,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("dense_decode_fwd_qkvfp8", &dense_attn_decode_qkvfp8_interface);
m.def("dense_decode_fwd_kvfp8", &dense_attn_decode_kvfp8_interface);
m.def("sparse_prefill_fwd", &sparse_attn_prefill_interface);
m.def("get_mla_decoding_metadata_dense_fp8", &get_mla_decoding_metadata_dense_fp8);
m.def("fwd_kvcache_quantization_mla", &mha_fwd_kvcache_quantization_mla);
m.def("fwd_kvcache_quantization_q_nope_pe_mla", &mha_fwd_kvcache_quantization_q_nope_pe_mla);
m.def("fwd_kvcache_mla_nope_pe", &mha_fwd_kvcache_mla_nope_pe);
m.def("fwd_kvcache_mla_fp8", &mha_fwd_kvcache_mla_fp8);
m.def("fwd_kvcache_mla_fp8_with_cat", &mha_fwd_kvcache_mla_fp8_with_cat);
}
This diff is collapsed.
#include "flash_fwd_mla_kernel.h"
template void run_mha_fwd_splitkv_mla<cutlass::bfloat16_t, 576>(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, cudaStream_t stream, bool is_q_nope_pe = false);
#include "flash_fwd_mla_kernel.h"
template void run_mha_fwd_splitkv_mla<cutlass::half_t, 576>(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, cudaStream_t stream, bool is_q_nope_pe = false);
#include "flash_fwd_mla_kernel_fp8.h"
template void run_mha_fwd_splitkv_mla_fp8<cutlass::float_e4m3_t,cutlass::bfloat16_t,576>(Flash_fwd_mla_params &params, cudaStream_t stream, bool is_with_cat);
#include "flash_fwd_mla_kernel_fp8.h"
template void run_mha_fwd_splitkv_mla_fp8<cutlass::bfloat16_t,cutlass::bfloat16_t,576>(Flash_fwd_mla_params &params, cudaStream_t stream, bool is_with_cat);
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
#include "flash_fwd_mla_kernel.h"
static constexpr int MaxBatchSize = 4096;
__global__ void __launch_bounds__(64, 1)
get_mla_metadata_kernel(const Mla_metadata_params params) {
int *seqlens_k_ptr = params.seqlens_k_ptr;
int *tile_scheduler_metadata_ptr = params.tile_scheduler_metadata_ptr;
int *num_splits_ptr = params.num_splits_ptr;
int batch_size = params.batch_size;
int block_size_n = params.block_size_n;
int fixed_overhead_num_blocks = params.fixed_overhead_num_blocks;
int num_sm_parts = params.num_sm_parts;
__shared__ int num_blocks_shared[MaxBatchSize];
__shared__ int num_splits_shared[MaxBatchSize];
int total_num_blocks = 0;
for (int i = threadIdx.x; i < batch_size; i += 64) {
int num_blocks = cutlass::ceil_div(seqlens_k_ptr[i], block_size_n);
total_num_blocks += num_blocks + fixed_overhead_num_blocks;
num_blocks_shared[i] = num_blocks;
}
for (int offset = 32; offset >= 1; offset /= 2) {
// total_num_blocks += __shfl_xor(uint32_t(-1), total_num_blocks, offset);
total_num_blocks += __shfl_xor(total_num_blocks, offset, 64);
}
__syncthreads();
if (threadIdx.x == 0) {
int payload = cutlass::ceil_div(total_num_blocks, num_sm_parts) + fixed_overhead_num_blocks;
int now_idx = 0, now_block = 0, now_n_split_idx = 0, cum_num_splits = 0;
num_splits_shared[0] = 0;
for (int i = 0; i < num_sm_parts; ++i) {
int tile_scheduler_metadata0[4], tile_scheduler_metadata1;
tile_scheduler_metadata0[0] = now_idx;
tile_scheduler_metadata0[1] = now_block * block_size_n;
tile_scheduler_metadata1 = now_n_split_idx;
int remain_payload = payload;
while (now_idx < batch_size) {
int num_blocks = num_blocks_shared[now_idx];
int now_remain_blocks = num_blocks - now_block;
if (remain_payload >= now_remain_blocks + fixed_overhead_num_blocks) {
cum_num_splits += now_n_split_idx + 1;
num_splits_shared[now_idx + 1] = cum_num_splits;
remain_payload -= now_remain_blocks + fixed_overhead_num_blocks;
++now_idx;
now_block = 0;
now_n_split_idx = 0;
} else {
if (remain_payload - fixed_overhead_num_blocks > 0) {
now_block += remain_payload - fixed_overhead_num_blocks;
++now_n_split_idx;
remain_payload = 0;
}
break;
}
}
tile_scheduler_metadata0[2] = now_block > 0 ? now_idx : now_idx - 1;
tile_scheduler_metadata0[3] = now_block > 0 ? now_block * block_size_n : seqlens_k_ptr[now_idx - 1];
*reinterpret_cast<int4 *>(tile_scheduler_metadata_ptr + i * TileSchedulerMetaDataSize) = *reinterpret_cast<int4 *>(tile_scheduler_metadata0);
tile_scheduler_metadata_ptr[i * TileSchedulerMetaDataSize + 4] = tile_scheduler_metadata1;
}
//FLASH_DEVICE_ASSERT(now_idx == batch_size && now_block == 0 && now_n_split_idx == 0);
}
__syncthreads();
for (int i = threadIdx.x; i <= batch_size; i += 64) {
num_splits_ptr[i] = num_splits_shared[i];
}
}
void get_mla_metadata_func(Mla_metadata_params &params, cudaStream_t stream) {
FLASH_ASSERT(params.batch_size < MaxBatchSize);
get_mla_metadata_kernel<<<1, 64, 0, stream>>>(params);
CHECK_CUDA_KERNEL_LAUNCH();
}
__global__ void __launch_bounds__(64, 1)
get_mla_decoding_metadata_kernel(const GetDecodingMetadataParams params) {
int *seqlens_k_ptr = params.seqlens_k_ptr;
int *tile_scheduler_metadata_ptr = params.tile_scheduler_metadata_ptr;
int *num_splits_ptr = params.num_splits_ptr;
int batch_size = params.batch_size;
int block_size_n = params.block_size_n;
int fixed_overhead_num_blocks = params.fixed_overhead_num_blocks;
int num_sm_parts = params.num_sm_parts;
extern __shared__ int shared_mem[];
int* num_blocks_shared = shared_mem; // [batch_size]
int* num_splits_shared = shared_mem + batch_size; // [batch_size+1]
int* seqlens_k_shared = shared_mem + batch_size*2+1; // [batch_size]
int* first_block_idx_shared = shared_mem + batch_size*3+1; // [batch_size]
int* last_block_idx_shared = shared_mem + batch_size*4+1; // [batch_size]
int total_num_blocks = 0;
for (int i = threadIdx.x; i < batch_size; i += 64) {
int cur_s_k = params.topk == -1 ? __ldg(seqlens_k_ptr + i) : params.topk;
seqlens_k_shared[i] = cur_s_k;
int first_token_idx = 0;
int last_token_idx = max(cur_s_k-1, 0);
int cur_first_block_idx = first_token_idx / block_size_n;
int cur_last_block_idx = last_token_idx / block_size_n;
// NOTE Should attend to tokens [first_token_idx, last_token_idx], i.e. blocks [cur_first_block_idx, cur_last_block_idx]
// NOTE Before clamping, first_token_idx <= last_token_idx always holds, so after clamping, first_token_idx <= last_token_idx still holds.
// NOTE if seqlens_k is 0, then first_token_idx == last_token_idx == cur_first_block_idx == cur_last_block_idx == 0. So the sequence will have 1 block. We will correct this later in this kernel.
int num_blocks = cur_last_block_idx - cur_first_block_idx + 1;
total_num_blocks += num_blocks + fixed_overhead_num_blocks;
num_blocks_shared[i] = num_blocks;
first_block_idx_shared[i] = cur_first_block_idx;
last_block_idx_shared[i] = cur_last_block_idx;
}
for (int offset = 32; offset >= 1; offset /= 2) {
// total_num_blocks += __shfl_xor_sync(uint32_t(-1), total_num_blocks, offset);
total_num_blocks += __shfl_xor(total_num_blocks, offset, 64);
}
__syncthreads();
if (threadIdx.x == 0) {
int payload = cutlass::ceil_div(total_num_blocks, num_sm_parts) + fixed_overhead_num_blocks;
int now_idx = 0, now_block = 0, now_n_split_idx = 0, cum_num_splits = 0;
num_splits_shared[0] = 0;
for (int i = 0; i < num_sm_parts; ++i) {
int tile_scheduler_metadata0[4], tile_scheduler_metadata1;
tile_scheduler_metadata0[0] = now_idx;
tile_scheduler_metadata0[1] = now_block + first_block_idx_shared[now_idx];
tile_scheduler_metadata1 = now_n_split_idx;
int remain_payload = payload;
while (now_idx < batch_size) {
int num_blocks = num_blocks_shared[now_idx];
int now_remain_blocks = num_blocks - now_block;
if (remain_payload >= now_remain_blocks + fixed_overhead_num_blocks) {
cum_num_splits += now_n_split_idx + 1;
num_splits_shared[now_idx + 1] = cum_num_splits;
remain_payload -= now_remain_blocks + fixed_overhead_num_blocks;
++now_idx;
now_block = 0;
now_n_split_idx = 0;
} else {
if (remain_payload - fixed_overhead_num_blocks > 0) {
now_block += remain_payload - fixed_overhead_num_blocks;
++now_n_split_idx;
remain_payload = 0;
}
break;
}
}
tile_scheduler_metadata0[2] = now_block > 0 ? now_idx : now_idx - 1;
tile_scheduler_metadata0[3] = now_block > 0 ? now_block + first_block_idx_shared[now_idx] : (seqlens_k_shared[now_idx-1] == 0 ? 0 : last_block_idx_shared[now_idx-1] + 1);
*reinterpret_cast<int4 *>(tile_scheduler_metadata_ptr + i * TileSchedulerMetaDataSize) = *reinterpret_cast<int4 *>(tile_scheduler_metadata0);
tile_scheduler_metadata_ptr[i * TileSchedulerMetaDataSize + 4] = tile_scheduler_metadata1;
}
// FLASH_DEVICE_ASSERT(now_idx == batch_size && now_block == 0 && now_n_split_idx == 0);
}
__syncthreads();
for (int i = threadIdx.x; i <= batch_size; i += 64) {
num_splits_ptr[i] = num_splits_shared[i];
}
}
void run_get_mla_metadata_kernel(GetDecodingMetadataParams &params, cudaStream_t stream) {
int smem_size = sizeof(int) * (params.batch_size*5+1);
// CHECK_CUDA(cudaFuncSetAttribute(get_mla_metadata_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
get_mla_decoding_metadata_kernel<<<1, 64, smem_size, stream>>>(params);
CHECK_CUDA_KERNEL_LAUNCH();
}
\ No newline at end of file
#pragma once
////////////////////////////////////////////////////////////////////////////////////////////////////
struct Flash_fwd_mla_params {
using index_t = int64_t;
int b, seqlen_q, d, d_v;
int h, h_h_k_ratio, ngroups;
bool is_causal;
float scale_softmax, scale_softmax_log2;
int *__restrict__ cu_seqlens_k;
void *__restrict__ q_ptr;
void *__restrict__ q_nope_ptr;
void *__restrict__ q_pe_ptr;
void *__restrict__ k_ptr;
void *__restrict__ v_ptr;
void *__restrict__ o_ptr;
void *__restrict__ softmax_lse_ptr;
index_t q_batch_stride;
index_t q_nope_batch_stride;
index_t q_pe_batch_stride;
index_t k_batch_stride;
index_t v_batch_stride;
index_t o_batch_stride;
index_t q_row_stride;
index_t q_nope_row_stride;
index_t q_pe_row_stride;
index_t k_row_stride;
index_t v_row_stride;
index_t o_row_stride;
index_t q_head_stride;
index_t q_nope_head_stride;
index_t q_pe_head_stride;
index_t k_head_stride;
index_t v_head_stride;
index_t o_head_stride;
int *__restrict__ block_table;
index_t block_table_batch_stride;
int page_block_size;
int *__restrict__ tile_scheduler_metadata_ptr;
int num_sm_parts;
int *__restrict__ num_splits_ptr;
void *__restrict__ softmax_lseaccum_ptr;
void *__restrict__ oaccum_ptr;
void *__restrict__ k_scale_ptr;
float * __restrict__ descale_q_ptr ;
float * __restrict__ descale_k_ptr ;
};
struct SparsePrefillParams {
int s_q, s_kv, h_q, h_kv, d_qk, d_v, topk;
float sm_scale, sm_scale_div_log2;
// Input tensors
void* __restrict__ q; // [s_q, h_q, d_qk]
void* __restrict__ kv; // [s_kv, h_kv, d_qk]
void* __restrict__ indices; // [s_q, h_kv, topk]
int stride_q_s_q;
int stride_q_h_q;
int stride_kv_s_kv;
int stride_kv_h_kv;
int stride_indices_s_q;
int stride_indices_h_kv;
// Output tensors
void* __restrict__ out; // [s_q, h_q, d_v]
void* __restrict__ max_logits; // [s_q, h_q]
void* __restrict__ lse; // [s_q, h_q]
// cudaStream_t stream;
};
static constexpr int TileSchedulerMetaDataSize = 8;
// [begin_idx, begin_seqlen, end_idx, end_seqlen, begin_n_split_idx, _, _, _]
////////////////////////////////////////////////////////////////////////////////////////////////////
template<typename T, int Headdim>
void run_mha_fwd_splitkv_mla(Flash_fwd_mla_params &params, const std::string& kv_cache_dtype, hipStream_t stream, bool is_q_nope_pe = false);
template<typename T, typename To, int Headdim>
void run_mha_fwd_splitkv_mla_fp8(Flash_fwd_mla_params &params,hipStream_t stream, bool is_with_cat);
template<typename T, int Headdim>
void run_mha_fwd_sparse_prefill(const SparsePrefillParams &params, hipStream_t stream);
struct Mla_metadata_params {
int *__restrict__ seqlens_k_ptr;
int *__restrict__ tile_scheduler_metadata_ptr;
int *__restrict__ num_splits_ptr;
int batch_size;
int block_size_n;
int fixed_overhead_num_blocks;
int num_sm_parts;
};
void get_mla_metadata_func(Mla_metadata_params &params, cudaStream_t stream);
enum class Fp8KVCacheDataType {
kAuto = 0,
kFp8E4M3 = 1,
kFp8E5M2 = 2,
kInt8 = 3,
};
static inline bool get_env_(const char *env_var) {
if (char *value = std::getenv(env_var)) {
if (strcmp(value, "0") == 0) {
return false;
}
return true;
}
return false;
}
struct GetDecodingMetadataParams {
int *__restrict__ seqlens_k_ptr;
int *__restrict__ tile_scheduler_metadata_ptr;
int *__restrict__ num_splits_ptr;
int batch_size;
int block_size_n;
int fixed_overhead_num_blocks;
int num_sm_parts;
int topk;
};
void run_get_mla_metadata_kernel(GetDecodingMetadataParams &params, cudaStream_t stream);
struct DecodingParams {
using index_t = int64_t;
int b; // batch size
int s_q;
int q_seq_per_hk; // The number of q(s) per KV head, = h_q / h_k * s_q
int d, d_v; // K/V dimension
int h_q, h_k; // The number of Q/K heads
int num_blocks; // Number of blocks in total
int q_head_per_hk; // The number of q_head(s) per KV head, = h_q / h_k
bool is_causal;
float scale_softmax, scale_softmax_log2;
int topk;
void *__restrict__ q_ptr;
void *__restrict__ k_ptr;
void *__restrict__ o_ptr;
void *__restrict__ softmax_lse_ptr;
int *__restrict__ indices_ptr;
index_t q_batch_stride;
index_t k_batch_stride;
index_t o_batch_stride;
index_t q_row_stride;
index_t k_row_stride;
index_t o_row_stride;
index_t q_head_stride;
index_t k_head_stride;
index_t o_head_stride;
index_t indices_batch_stride;
index_t indices_row_stride;
int *__restrict__ block_table;
index_t block_table_batch_stride;
int page_block_size;
int *__restrict__ seqlens_k_ptr;
int *__restrict__ tile_scheduler_metadata_ptr;
int num_sm_parts;
int *__restrict__ num_splits_ptr;
int total_num_splits;
void *__restrict__ softmax_lseaccum_ptr;
void *__restrict__ oaccum_ptr;
};
template<typename T, int Headdim>
void run_flash_splitkv_sparse_mla_kernel(const DecodingParams &params, cudaStream_t stream);
\ No newline at end of file
This diff is collapsed.
#pragma once
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
exit(1); \
} \
} while(0)
#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError())
#define FLASH_ASSERT(cond) \
do { \
if (not (cond)) { \
fprintf(stderr, "Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond); \
exit(1); \
} \
} while(0)
#define FLASH_DEVICE_ASSERT(cond) \
do { \
if (not (cond)) { \
printf("Assertion failed (%s:%d): %s\n", __FILE__, __LINE__, #cond); \
asm("S_ENDPGM;"); \
} \
} while(0)
#define BOOL_SWITCH(COND, CONST_NAME, ...) \
[&] { \
if (COND) { \
constexpr static bool CONST_NAME = true; \
return __VA_ARGS__(); \
} else { \
constexpr static bool CONST_NAME = false; \
return __VA_ARGS__(); \
} \
}()
#define MLA_NUM_SPLITS_SWITCH(NUM_SPLITS, NAME, ...) \
[&] { \
if (NUM_SPLITS <= 32) { \
constexpr static int NAME = 32; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 64) { \
constexpr static int NAME = 64; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 72) { \
constexpr static int NAME = 72; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 96) { \
constexpr static int NAME = 96; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 128) { \
constexpr static int NAME = 128; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 144) { \
constexpr static int NAME = 144; \
return __VA_ARGS__(); \
} else if (NUM_SPLITS <= 160) { \
constexpr static int NAME = 160; \
return __VA_ARGS__(); \
} else { \
FLASH_ASSERT(false); \
} \
}()
This diff is collapsed.
......@@ -17,6 +17,7 @@ compute_attn_1rowblock_splitkv_mla_kvfp8(const DenseAttnDecodeParams_fp8 &params
const int n_split_idx, const int seqlen_k,
const int n_block_min, const int n_block_max, const bool NoSplit)
{
#if 0
constexpr static bool Is_causal = T::Is_causal;
constexpr int kBlockM = T::kBlockM;
constexpr int kBlockN = T::kBlockN;
......@@ -384,9 +385,7 @@ compute_attn_1rowblock_splitkv_mla_kvfp8(const DenseAttnDecodeParams_fp8 &params
}
}
}
#endif
}
template<typename T>
......
......@@ -184,9 +184,9 @@ inline constexpr bool is_decode_v = std::bool_constant<FWD_MODE == SparseAttnFwd
template<SparseAttnFwdMode FWD_MODE>
using SparseFwdArgT = std::conditional_t<is_decode_v<FWD_MODE>, SparseAttnDecodeParams, SparseAttnFwdParams>;
enum class Fp8KVCacheDataType {
kAuto = 0,
kFp8E4M3 = 1,
kFp8E5M2 = 2,
kInt8 = 3,
};
// enum class Fp8KVCacheDataType {
// kAuto = 0,
// kFp8E4M3 = 1,
// kFp8E5M2 = 2,
// kInt8 = 3,
// };
......@@ -1081,7 +1081,7 @@ void wait_vmcnt() {
"s_barrier; \n\t"
:: "n"(N));
}
#if 0
template<typename Element, bool is_scale_equal_one, Fp8KVCacheDataType KV_DTYPE, typename Tensor0, typename Tensor1, typename Tensor2, typename Tensor3, typename Tensor4,
typename TiledMma, typename TiledCopy, typename ThrCopy>
__forceinline__ __device__ void gemm_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor2 &tCrB_int8, Tensor3 &tCrB, Tensor4 const& tCsB,
......@@ -1302,7 +1302,7 @@ __forceinline__ __device__ void gemm_k_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tens
}
cute::gemm(tiled_mma, tCrA(_, _, k_idx), tCrB(_, _, k_idx), acc);
}
#endif
template <
bool Is_even_MN=true,
bool Is_even_K=true,
......@@ -1367,7 +1367,7 @@ buffer_load_copy_fp8(
}
}
#if 0
template<typename Element, bool is_scale_equal_one, Fp8KVCacheDataType KV_DTYPE, typename Tensor0, typename Tensor1, typename Tensor3, typename Tensor4,
typename TiledMma, typename TiledCopy, typename ThrCopy>
__forceinline__ __device__ void gemm1_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor3 &tCrB, Tensor4 const& tCsB,
......@@ -1500,8 +1500,7 @@ __forceinline__ __device__ void gemm1_rs_fp8(Tensor0 &acc, Tensor1 &tCrA, Tensor
}
}
#endif
}
\ No newline at end of file
......@@ -4,14 +4,27 @@ from flash_mla.flash_mla_interface import (
get_mla_metadata,
flash_mla_with_kvcache,
flash_mla_sparse_fwd,
flash_mla_with_kvcache_qkvfp8,
flash_mla_with_kvcache_kvfp8
get_mla_decoding_metadata_dense_fp8,
flash_mla_with_kvcache_quantization,
flash_mla_with_kvcache_q_nope_pe,
flash_mla_with_kvcache_quantization_q_nope_pe,
flash_mla_with_kvcache_fp8,
flash_mla_with_kvcache_fp8_with_cat
)
__all__ = [
"get_mla_metadata",
"flash_mla_with_kvcache",
"flash_mla_sparse_fwd",
"flash_mla_with_kvcache_qkvfp8",
"flash_mla_with_kvcache_kvfp8"
"get_mla_decoding_metadata_dense_fp8",
"flash_mla_with_kvcache_quantization",
"flash_mla_with_kvcache_q_nope_pe",
"flash_mla_with_kvcache_quantization_q_nope_pe",
"flash_mla_with_kvcache_fp8",
"flash_mla_with_kvcache_fp8_with_cat"
]
import os
FLASH_MLA_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# print(FLUX_ROOT_DIR)
os.environ["FLASH_MLA_ROOT_DIR"] = FLASH_MLA_ROOT_DIR + "/asm/"
\ No newline at end of file
clang -x assembler -target amdgcn-amd-amdhsa -mcode-object-version=4 -mcpu=gfx938:sramecc+ -c -o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.s
clang -target amdgcn-amd-amdhsa flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.o -o flash_fwd_mla_fp8_gfx938-hip-amdgcn-amd-amdhsa-gfx938.co
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment