Unverified commit ae0ccb40, authored by Or Sharir and committed by GitHub
Browse files

Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism) when using Multi-LoRA. (#3350)

Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism) when using Multi-LoRA. (#3350)
parent 739c350c
...@@ -43,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, ...@@ -43,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 14336) \
f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 16384) \
f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 20480) \
f(in_T, out_T, W_T, narrow, 22016) \
f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 24576) \
f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 28672) \
f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32000) \
......
...@@ -45,7 +45,7 @@ def _lora_ref_impl( ...@@ -45,7 +45,7 @@ def _lora_ref_impl(
H1 = H2 = [ H1 = H2 = [
128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120,
5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336,
24576, 32000, 32256, 32512, 32768, 33024 22016, 24576, 32000, 32256, 32512, 32768, 33024
] ]
SEED = [0xabcdabcd987] SEED = [0xabcdabcd987]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment