Unverified commit 989ae253, authored by Jee Li and committed by GitHub
Browse files

[Kernel] Add punica dimension for Baichuan-13B (#4053)

parent 0a430b4a
...@@ -47,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, ...@@ -47,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13696) \
f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 13824) \
f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 14336) \
f(in_T, out_T, W_T, narrow, 15360) \
f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 16384) \
f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, narrow, 20480) \
f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 22016) \
......
...@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files): ...@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
@pytest.mark.skip("Requires multiple GPUs") @pytest.mark.skip("Requires multiple GPUs")
def test_llama_tensor_parallel_equality(baichuan_lora_files): def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
# Cannot use as it will initialize torch.cuda too early... # Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4: # if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}") # pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
......
...@@ -72,6 +72,7 @@ H1 = H2 = [ ...@@ -72,6 +72,7 @@ H1 = H2 = [
11008, 11008,
13824, 13824,
14336, 14336,
15360,
22016, 22016,
24576, 24576,
27392, 27392,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment