ox696c / ktransformers · Commits

Commit 18c42e67, authored Jul 27, 2024 by chenxl

Initial commit

Changes: 247 files in total; this page shows 20 changed files with 1037 additions and 0 deletions (+1037, -0).
Changed files shown on this page:

ktransformers/ktransformers_ext/bench/bench_moe_torch.py                 +163  -0
ktransformers/ktransformers_ext/cpu_backend/backend.cpp                  +101  -0
ktransformers/ktransformers_ext/cpu_backend/backend.h                    +51   -0
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h                   +58   -0
ktransformers/ktransformers_ext/cpu_backend/task_queue.cpp               +57   -0
ktransformers/ktransformers_ext/cpu_backend/task_queue.h                 +40   -0
ktransformers/ktransformers_ext/cuda/binding.cpp                         +32   -0
ktransformers/ktransformers_ext/cuda/custom_gguf/binding.cpp             +25   -0
ktransformers/ktransformers_ext/cuda/custom_gguf/custom_ggml.h           +40   -0
ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu              +164  -0
ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h                   +19   -0
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu          +0    -0
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cuh         +80   -0
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh  +80   -0
ktransformers/ktransformers_ext/cuda/gptq_marlin/ops.h                   +25   -0
ktransformers/ktransformers_ext/cuda/setup.py                            +18   -0
ktransformers/ktransformers_ext/examples/test_linear.py                  +84   -0
ktransformers/ktransformers_ext/examples/test_mlp.py                     +0    -0
ktransformers/ktransformers_ext/examples/test_moe.py                     +0    -0
ktransformers/ktransformers_ext/ext_bindings.cpp                         +0    -0
ktransformers/ktransformers_ext/bench/bench_moe_torch.py (new file, mode 100644)
#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq

def act_fn(x):
    return x / (1.0 + torch.exp(-x))

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        expert_num = 10
        hidden_size = 5120
        intermediate_size = 1536
        n_routed_experts = 6
        layer_num = 10
        warm_up_iter = 1000
        test_iter = 10000
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert(False)
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        # warm up
        for i in range(warm_up_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
        # test
        total_time = 0
        for i in range(test_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            start = time.perf_counter()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
            end = time.perf_counter()
            total_time += end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")
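Note on the printed "Bandwidth" figure (spelling out what the expression in the script above computes): each timed iteration routes one token through n_routed_experts experts, and each selected expert reads its gate, up, and down projection matrices once, so the counted traffic per iteration is 3 * hidden_size * intermediate_size * n_routed_experts * bytes_per_elem bytes; summing over test_iter iterations and dividing by total_time and 10^9 gives GB/s. As written, the metric counts weight reads only and ignores activation and routing-tensor traffic.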
ktransformers/ktransformers_ext/cpu_backend/backend.cpp (new file, mode 100644)
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:34
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "backend.h"

Backend::Backend(int thread_num) {
    thread_num_ = thread_num;
    thread_state_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(thread_num);
    for (int i = 1; i < thread_num; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
    }
    for (int i = 1; i < thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() {
    return thread_num_;
}

void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
    func_ = func;
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);
    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
    process_tasks(0);
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        func_(task_id);
    }
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            func_(task_id);
        }
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    while (true) {
        ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            auto now = std::chrono::steady_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}
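Not part of this commit: a minimal sketch of how the work-stealing pool above is driven, using only the API declared in backend.h (the task body and sizes are illustrative assumptions).

#include <vector>
#include "backend.h"

int main() {
    Backend backend(8);                  // calling thread acts as worker 0; 7 extra threads are spawned
    std::vector<float> out(1024, 0.0f);
    // Task ids 0..1023 are split evenly across threads up front; a thread that
    // finishes its own range steals remaining ids from the other threads' counters.
    backend.do_work_stealing_job(1024, [&](int task_id) {
        out[task_id] = task_id * 2.0f;   // illustrative per-task work
    });
    return 0;
}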
ktransformers/ktransformers_ext/cpu_backend/backend.h (new file, mode 100644)
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:05
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:38
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

enum ThreadStatus {
    WORKING,
    WAITING,
    EXIT,
};

struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};

class Backend {
   public:
    Backend(int);
    ~Backend();
    int get_thread_num();
    void do_work_stealing_job(int, std::function<void(int)>);

   private:
    int thread_num_;
    std::vector<ThreadState> thread_state_;  // [thread_num]
    std::function<void(int)> func_;
    std::vector<std::thread> workers_;

    void process_tasks(int);
    void worker_thread(int);
};

#endif
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h (new file, mode 100644)
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:42
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

#include "backend.h"
#include "task_queue.h"

#include "llama.cpp/ggml-impl.h"

class CPUInfer {
   public:
    CPUInfer(int thread_num) {
        backend_ = new Backend(thread_num - 1);
        task_queue_ = new TaskQueue();
        for (int i = 0; i < (1 << 16); ++i) {
            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
        }
    }

    ~CPUInfer() {
        delete backend_;
        delete task_queue_;
    }

    template <typename Func, typename Obj, typename... Args>
    void submit(Func f, Obj* obj, Args... args) {
        task_queue_->enqueue([=]() {
            std::invoke(f, *obj, args..., backend_);
        });
    }

    void sync() {
        task_queue_->sync();
    }

   public:
    Backend* backend_;
    TaskQueue* task_queue_;
};

#endif
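A usage sketch for CPUInfer::submit / sync (illustrative only; the Worker type below is a hypothetical stand-in for the real operator classes bound elsewhere in the extension). Because submit invokes the member function with backend_ appended as the last argument, the submitted method must take a Backend* as its final parameter:

#include "cpuinfer.h"

// Hypothetical operator class; real operators would follow the same shape,
// accepting a Backend* as their last parameter.
struct Worker {
    void forward(const float* in, float* out, Backend* backend) {
        backend->do_work_stealing_job(128, [=](int i) { out[i] = in[i] * 2.0f; });
    }
};

void example(CPUInfer& cpu_infer, Worker& w, const float* in, float* out) {
    cpu_infer.submit(&Worker::forward, &w, in, out);  // queued; runs on the task-queue thread
    cpu_infer.sync();                                 // spins until the queue is drained
}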
ktransformers/ktransformers_ext/cpu_backend/task_queue.cpp (new file, mode 100644)
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-17 12:25:51
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:44
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "task_queue.h"

TaskQueue::TaskQueue() {
    worker = std::thread(&TaskQueue::processTasks, this);
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
}

TaskQueue::~TaskQueue() {
    exit_flag.store(true, std::memory_order_seq_cst);
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    mutex.lock();
    tasks.push(task);
    sync_flag.store(false, std::memory_order_seq_cst);
    mutex.unlock();
}

void TaskQueue::sync() {
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        mutex.lock();
        if (tasks.empty()) {
            if (exit_flag.load(std::memory_order_seq_cst)) {
                return;
            }
            mutex.unlock();
            continue;
        }
        std::function<void()> task = tasks.front();
        mutex.unlock();
        task();
        mutex.lock();
        tasks.pop();
        if (tasks.empty()) {
            sync_flag.store(true, std::memory_order_seq_cst);
        }
        mutex.unlock();
    }
}
ktransformers/ktransformers_ext/cpu_backend/task_queue.h (new file, mode 100644)
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-07-16 10:43:18
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:33:47
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H

#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class TaskQueue {
   public:
    TaskQueue();
    ~TaskQueue();

    void enqueue(std::function<void()>);

    void sync();

   private:
    void processTasks();

    std::queue<std::function<void()>> tasks;
    std::thread worker;
    std::mutex mutex;
    std::atomic<bool> sync_flag;
    std::atomic<bool> exit_flag;
};

#endif
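The queue above is a single consumer thread with flag-based synchronization: enqueue clears sync_flag, the worker sets it again once the queue runs empty, and sync() spins on it. A minimal sketch of that contract (illustrative, not part of the commit):

#include <cstdio>
#include "task_queue.h"

int main() {
    TaskQueue queue;
    int result = 0;
    queue.enqueue([&]() { result += 1; });  // executed on the queue's worker thread
    queue.enqueue([&]() { result += 2; });  // tasks run strictly in FIFO order
    queue.sync();                           // busy-waits until both tasks have finished
    std::printf("result = %d\n", result);   // prints 3
    return 0;
}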
ktransformers/ktransformers_ext/cuda/binding.cpp (new file, mode 100644)
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:36:03
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/

#include "custom_gguf/ops.h"
#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;

PYBIND11_MODULE(KTransformersOps, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
          py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
          py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
}
ktransformers/ktransformers_ext/cuda/custom_gguf/binding.cpp (new file, mode 100644)
#include "ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;

int test() {
    return 5;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);

PYBIND11_MODULE(cudaops, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("test", &test, "Function to test.");
}
ktransformers/ktransformers_ext/cuda/custom_gguf/custom_ggml.h (new file, mode 100644)
#include <cuda_fp16.h>

__device__ float ggml_compute_fp16_to_fp32(uint16_t h) {
    return __uint2float_rd(h);
}

static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
    uint16_t tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
    return (float)tmp;
}

// define the global table for fp16 to fp32 conversion
__device__ float ggml_table_f32_f16[1 << 16];

// CUDA Kernel to init the table
__global__ void init_fp16_to_fp32_table() {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto blk_id = idx; blk_id < (1 << 16); blk_id += blockDim.x * gridDim.x) {
        ggml_table_f32_f16[blk_id] = GGML_COMPUTE_FP16_TO_FP32(blk_id);
    }
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

extern __device__ float ggml_table_f32_f16[1 << 16];  // Declare as __device__ if used within device code

// This version of the function is designed to be called from within a CUDA kernel
#if !defined(GGML_FP16_TO_FP32)
__device__ float ggml_lookup_fp16_to_fp32(uint16_t f) {
    return ggml_table_f32_f16[f];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
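For context, the intended pattern for the table above appears to be: populate ggml_table_f32_f16 once (for example by launching init_fp16_to_fp32_table), then let device code convert raw fp16 bit patterns through the GGML_FP16_TO_FP32 lookup macro. A hedged sketch (the kernel below is illustrative and not part of this commit; it assumes custom_ggml.h and <cstdint> are included):

// Convert an array of raw fp16 bit patterns to fp32 via the lookup table.
__global__ void fp16_bits_to_fp32(const uint16_t* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = GGML_FP16_TO_FP32(in[i]);  // one table read per element, no arithmetic
    }
}

// Host side: fill the table once before any kernel that uses the macro.
// init_fp16_to_fp32_table<<<256, 256>>>();
// cudaDeviceSynchronize();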
ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu (new file, mode 100644)
/*
 * @Description  :
 * @Author       : Azure-Tang, Boxin Zhang
 * @Date         : 2024-07-25 13:38:30
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 11:58:50
 * Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
 * Copyright (c) 2023-2024 The ggml authors
 * Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 */
#include <cuda_runtime.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>

__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        for (int i = 0; i < blk_size; i++) {
            float scale = scales[block_id];
            output[block_id * blk_size + i] = scale * qs[block_id * blk_size + i];
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t* q, uint8_t* __restrict__ d, uint8_t* __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
    }
}

__global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);
        // const uint8_t * q = data[i].qs;
        const uint8_t* q = (uint8_t*)(data + block_id * 144 + 16);

        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 2)));

        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < blk_size; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc;
            const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc;
            const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >> 4) - m2;
            q += 32;
            is += 2;
        }
    }
}

__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);

        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 208)));

        const uint8_t* __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t* __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t* __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);

        //if (blk_size == 256){
        for (int n = 0; n < blk_size; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l / 16;
                const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l + 0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}

torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
    int num_blocks = data.numel() / blk_size;
    // create gpu
    auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto scales_gpu = torch::empty({{num_blocks, 1}}, options_scales);
    auto qs_gpu = torch::empty({num_blocks, 32}, options_qs);

    // read on cpu
    options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU);
    options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);

    // // reinterpret
    auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1);
    auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2);

    auto scales_f32 = scales.to(torch::kFloat32);
    scales_gpu.copy_(scales_f32, false);
    qs_gpu.copy_(qs, false);

    // Create output tensor
    auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device));

    // Launch kernel
    dequantize_q8_0_kernel<<< 512, 256 >>>(
        output.data_ptr<float>(), scales_gpu.data_ptr<float>(), qs_gpu.data_ptr<int8_t>(), num_blocks, 32);

    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = data.numel() / blk_size;

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);

    data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));

    // Launch kernel
    dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
    // dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);

    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel%blk_size should be 0, else raise err
    int num_blocks = data.numel() / blk_size;

    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);

    data_gpu.copy_(data, false);

    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));

    // Launch kernel
    dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);

    cudaDeviceSynchronize();
    return output;
}
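A usage sketch for the q8_0 entry point above (illustrative; only the signature and the layout implied by the from_blob views come from this file). data is expected to be a flat CPU int8 tensor of packed GGUF q8_0 blocks, 34 bytes each (one fp16 scale followed by 32 int8 quants), and the fp16/int8 views inside dequantize_q8_0 only line up when blk_size equals that per-block byte size; the result is a float32 tensor of shape {num_blocks, 32} on the target device.

#include <torch/torch.h>
#include "ops.h"

torch::Tensor dequantize_q8_0_example(void* gguf_q8_0_bytes, int64_t num_blocks) {
    // Wrap the raw packed q8_0 data (34 bytes per block) in a CPU int8 tensor.
    auto raw = torch::from_blob(gguf_q8_0_bytes, {num_blocks * 34}, torch::kInt8);
    // blk_size = 34 so that num_blocks = raw.numel() / blk_size matches the packing.
    return dequantize_q8_0(raw, /*blk_size=*/34, torch::Device(torch::kCUDA, 0));
    // -> float32 tensor of shape {num_blocks, 32} on cuda:0
}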
ktransformers/ktransformers_ext/cuda/custom_gguf/ops.h (new file, mode 100644)
/**
 * @Description  :
 * @Author       : Azure-Tang
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:38:20
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu (new file, mode 100644; diff collapsed, contents not shown)
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cuh (new file, mode 100644)
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once

#include <torch/all.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <iostream>

namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
  // No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); }

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh (new file, mode 100644)
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>

namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) { return __half2float(x); }

  static __device__ half2 inline num2num2(const half x) { return __half2half2(x); }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) { return __float2half(x); }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin

#endif
ktransformers/ktransformers_ext/cuda/gptq_marlin/ops.h (new file, mode 100644)
/**
 * @Description  :
 * @Author       : Azure
 * @Date         : 2024-07-22 09:27:55
 * @Version      : 1.0.0
 * @LastEditors  : Azure
 * @LastEditTime : 2024-07-26 08:35:00
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
//                                  int64_t size_k, int64_t size_n,
//                                  int64_t num_bits);
ktransformers/ktransformers_ext/cuda/setup.py (new file, mode 100644)
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# setup marlin gemm
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension(
            'KTransformersOps', [
                'custom_gguf/dequant.cu',
                'binding.cpp',
                'gptq_marlin/gptq_marlin.cu',
                # 'gptq_marlin_repack.cu',
            ])
    ],
    cmdclass={'build_ext': BuildExtension})
ktransformers/ktransformers_ext/examples/test_linear.py (new file, mode 100644)
#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenht2022
Date         : 2024-07-25 10:32:05
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2024-07-25 10:34:00
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

with torch.inference_mode(mode=True):
    input_size = 16384
    output_size = 5120
    stride = 32
    proj_type = 1  # ggml_type::GGML_TYPE_F16
    hidden_type = 1  # ggml_type::GGML_TYPE_F16
    layer_num = 10
    CPUInfer = cpuinfer_ext.CPUInfer(48)
    validation_iter = 100
    warm_up_iter = 1000
    test_iter = 10000

    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100

        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        # print('cpuinfer output', output)

        proj = projs[i % layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)

        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)

    # warm up
    for i in range(warm_up_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()

    # test
    total_time = 0
    for i in range(test_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        start = time.perf_counter()
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        end = time.perf_counter()
        total_time += end - start
    print('Time: ', total_time)
    print('Iteration: ', test_iter)
    print('Time per iteration: ', total_time / test_iter)
    print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
    print("All tasks completed.")
ktransformers/ktransformers_ext/examples/test_mlp.py (new file, mode 100644; diff collapsed, contents not shown)

ktransformers/ktransformers_ext/examples/test_moe.py (new file, mode 100644; diff collapsed, contents not shown)

ktransformers/ktransformers_ext/ext_bindings.cpp (new file, mode 100644; diff collapsed, contents not shown)
(Changed-files listing: page 1 of 13.)