Unverified commit ec53d63f authored by tpoisonooo, committed by GitHub

fix(kv_qparams.py): zp use min (#59)

* fix(kv_qparams.py): zp use min

* revert(qparams.py): revert format

* fix(kv_qparams.py): update formula
parent 61e8d2c6
@@ -23,7 +23,7 @@
 ![](../../resources/batch_memory.png)
 Because each concurrent session needs 1030 MB of GPU memory to hold the kv_cache for 2048 tokens, and the serving side must weigh the cost of high-concurrency scenarios, quantizing the kv_cache is a better fit than quantizing the weights directly.
 Note that the `kCacheKVInt8` and `WeightInt4` schemes can run at the same time; we will provide the corresponding implementation later.
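A back-of-the-envelope check of the 1030 MB figure, assuming a LLaMA-7B-style configuration (32 layers, hidden size 4096, fp16 cache entries); the doc does not name the model, so these numbers are an assumption:

```python
# K and V caches, per layer, per token, in fp16 (assumed config, see above).
num_layers, hidden_size, seq_len, fp16_bytes = 32, 4096, 2048, 2
kv_bytes = 2 * num_layers * seq_len * hidden_size * fp16_bytes
print(kv_bytes / 2**20)  # 1024.0 MiB, in line with the quoted ~1030 MB
```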
@@ -167,6 +167,8 @@ def main(model: str,
             tp = i % num_tp
             save_path = out_dir / f'layers.{layer}.past_kv_scale.{tp}.weight'
             if symmetry:
+                # quant: q = f / scale
+                # dequant: f = q * scale
                 k_scale = max(k_obs.buffer) / (2**(bits - 1) - 1)
                 v_scale = max(v_obs.buffer) / (2**(bits - 1) - 1)
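The comments added in this hunk pin down the symmetric scheme: one scale per tensor, no zero-point. A minimal NumPy sketch of that round trip, where the sample tensor and the absmax reduction stand in for whatever `k_obs.buffer` actually collects (both are illustrative, not taken from kv_qparams.py):

```python
import numpy as np

bits = 8
f = np.random.randn(16).astype(np.float32)     # stand-in for one KV tensor
scale = np.abs(f).max() / (2**(bits - 1) - 1)
q = np.round(f / scale)                        # quant:   q = f / scale
f_hat = q * scale                              # dequant: f = q * scale
assert np.abs(f - f_hat).max() <= scale / 2 + 1e-6  # rounding error bound
```

Only `scale` needs to be persisted here: zero maps exactly to zero, which is why this branch writes no zero-points.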
@@ -175,6 +177,8 @@ def main(model: str,
                 print(f'Layer {layer} TP {tp} KV scales done.')
             else:
+                # quant: q = (f - zp) / scale
+                # dequant: f = q * scale + zp
                 k_min = min([min_k for min_k, _ in k_obs.buffer])
                 k_max = max([max_k for _, max_k in k_obs.buffer])
@@ -184,10 +188,7 @@ def main(model: str,
                 k_scale = (k_max - k_min) / (2**bits - 1)
                 v_scale = (v_max - v_min) / (2**bits - 1)
-                k_zero = (-k_min / k_scale).round()
-                v_zero = (-v_min / v_scale).round()
-                kv_qparams = np.array([k_scale, k_zero, v_scale, v_zero],
+                kv_qparams = np.array([k_scale, k_min, v_scale, v_min],
                                       dtype=np.float32)
                 kv_qparams.tofile(save_path)
                 print(f'Layer {i} KV scales&zeros done.')
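Net effect of this last hunk: instead of persisting a rounded integer zero-point computed as `(-min / scale).round()`, the file now stores the float minimum itself, and dequantization folds it back in directly. A sketch of the resulting asymmetric round trip under the formulas from the new comments, with illustrative sample values:

```python
import numpy as np

bits = 8
f = np.random.randn(16).astype(np.float32)     # stand-in for one KV tensor
f_min, f_max = float(f.min()), float(f.max())
scale = (f_max - f_min) / (2**bits - 1)
zp = f_min                                     # the stored zp is now the min
q = np.round((f - zp) / scale)                 # quant:   q = (f - zp) / scale
f_hat = q * scale + zp                         # dequant: f = q * scale + zp
qparams = np.array([scale, zp], dtype=np.float32)  # mirrors kv_qparams layout
```

Storing the min keeps the on-disk format at four float32 values per layer/TP pair while dropping the intermediate rounding step from the saved parameters.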