Unverified Commit 823ab796 authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

Update `pre-commit` hooks (#12475)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 6116ca8c
......@@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
......@@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
......@@ -35,7 +35,7 @@ repos:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
......
......@@ -926,8 +926,8 @@ def main(args: argparse.Namespace):
)
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
......
......@@ -38,9 +38,13 @@ struct Signal {
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
};
struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };
struct __align__(16) RankData {
const void* __restrict__ ptrs[8];
};
struct __align__(16) RankSignals { Signal* signals[8]; };
struct __align__(16) RankSignals {
Signal* signals[8];
};
// like std::array, but aligned
template <typename T, int sz>
......
......@@ -138,8 +138,8 @@ __device__ inline FragB dequant<vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
......@@ -182,8 +182,8 @@ __device__ inline FragB dequant<vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
......
......@@ -173,8 +173,8 @@ dequant<half, vllm::kU4B8.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
......@@ -197,9 +197,9 @@ dequant<nv_bfloat16, vllm::kU4B8.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
......@@ -221,8 +221,8 @@ dequant<half, vllm::kU4.id()>(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
const int SUB = 0x64006400;
const int MUL = 0x2c002c00;
......@@ -244,9 +244,9 @@ dequant<nv_bfloat16, vllm::kU4.id()>(int q) {
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
q >>= 4;
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX);
typename ScalarType<nv_bfloat16>::FragB frag_b;
static constexpr uint32_t MUL = 0x3F803F80;
......
......@@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
......
......@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static constexpr uint32_t HI = 0x00f000f0;
static constexpr uint32_t EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static constexpr uint32_t SUB = 0x64086408;
......
......@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const int HI = 0x00f000f0;
const int EX = 0x64006400;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX);
int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX);
int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX);
int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const int SUB = 0x64086408;
......
......@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
// max_num_partitions, head_size]
const int* __restrict__ context_lens, // [num_seqs]
const int max_num_partitions){UNREACHABLE_CODE}
const int max_num_partitions) {
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
......
......@@ -417,7 +417,7 @@ def get_rocm_version():
if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
ctypes.byref(patch)) == 0):
return "%d.%d.%d" % (major.value, minor.value, patch.value)
return f"{major.value}.{minor.value}.{patch.value}"
return None
except Exception:
return None
......
......@@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A,
A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
]
B_tiles = [[
B[j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
B[
j * block_n:min((j + 1) * block_n, N),
i * block_k:min((i + 1) * block_k, K),
] for i in range(k_tiles)
] for j in range(n_tiles)]
C_tiles = [
C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
......@@ -157,9 +159,9 @@ def setup_cuda():
torch.set_default_device("cuda")
@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
SEEDS))
@pytest.mark.parametrize(
"num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS))
@torch.inference_mode()
def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
torch.manual_seed(seed)
......@@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
assert torch.allclose(scale, ref_scale)
@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
SEEDS))
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed)
......@@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001
@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
BLOCK_SIZE, DTYPES, SEEDS))
@pytest.mark.parametrize(
"M,N,K,E,topk,block_size,dtype,seed",
itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES,
SEEDS))
@torch.inference_mode()
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
torch.manual_seed(seed)
......
......@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0
print("My rank: %d, device: %s" % (my_rank, device))
print(f"My rank: {my_rank}, device: {device}")
# insert
tokens = torch.tensor([1, 2, 3]).to(device)
......@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0
print("My rank: %d, Test run passed!" % (my_rank))
print(f"My rank: {my_rank}, Test run passed!")
def stress_test(my_rank, buf, device):
......@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert torch.allclose(k, k_)
assert torch.allclose(v, v_)
assert torch.allclose(h, h_)
print('Rank %d done' % my_rank)
print(f"Rank {my_rank} done")
torch.distributed.barrier()
if my_rank == 0:
......@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else:
torch.distributed.send(torch.tensor([n]), 0)
print("My rank: %d, Passed stress test!" % (my_rank))
print(f"My rank: {my_rank}, Passed stress test!")
if __name__ == "__main__":
......@@ -122,7 +122,7 @@ if __name__ == "__main__":
rank=my_rank,
)
print("initialized! My rank is %d" % my_rank)
print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig(
kv_connector='PyNcclConnector',
......
......@@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.mark.xfail(current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm"
)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM(
MODEL_PATH,
......
......@@ -521,12 +521,13 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - image embeddings
# - video
# - custom inputs
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=False,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=False,
))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
......@@ -543,12 +544,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=False,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=False,
))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
......@@ -565,12 +567,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=False,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=False,
))
def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
......@@ -586,12 +589,13 @@ def test_image_embedding_models(model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=False,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=False,
))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
video_assets: _VideoAssets):
......@@ -605,12 +609,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=False,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=False,
))
def test_custom_inputs_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
......@@ -627,12 +632,13 @@ def test_custom_inputs_models(
#### Tests filtering for things running each test as a new process
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=True,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
......@@ -650,12 +656,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=True,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
......@@ -673,12 +680,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=True,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs,
......@@ -695,12 +703,13 @@ def test_image_embedding_models_heavy(model_type: str,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=True,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
fork_new_process_for_each_test=True,
))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
......@@ -715,12 +724,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
)
@pytest.mark.parametrize("model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=True,
))
@pytest.mark.parametrize(
"model_type,test_case",
get_parametrized_options(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
fork_new_process_for_each_test=True,
))
@fork_new_process_for_each_test
def test_custom_inputs_models_heavy(
model_type: str,
......
......@@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text,
[{k: asdict(v)
for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])])
json_data = [(tokens, text, [{
k: asdict(v)
for k, v in token_logprobs.items()
} for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
with open(filename, "w") as f:
......@@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text,
[{int(k): Logprob(**v)
for k, v in token_logprobs.items()}
for token_logprobs in logprobs])
for tokens, text, logprobs in json_data]
return [(tokens, text, [{
int(k): Logprob(**v)
for k, v in token_logprobs.items()
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
@large_gpu_test(min_gb=80)
......
......@@ -314,9 +314,9 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
@pytest.mark.skipif(not sparse_cutlass_supported(),
reason="2of4 Sparse is not yet supported on this GPU type."
)
@pytest.mark.skipif(
not sparse_cutlass_supported(),
reason="2of4 Sparse is not yet supported on this GPU type.")
@pytest.mark.parametrize(
"args_2of4",
[("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")])
......
......@@ -23,16 +23,17 @@ def mock_causal_accepted_tensor(
"""
batch_size = last_accepted_indices.shape[0]
accepted = (torch.arange(k).expand(batch_size, k) <=
last_accepted_indices.unsqueeze(-1).broadcast_to(
accepted = (torch.arange(k).expand(batch_size, k)
<= last_accepted_indices.unsqueeze(-1).broadcast_to(
batch_size, k))
# Sprinkle accepted values after the contiguous initial accepted values.
# This replicates the behavior of rejection sampling, which may "accept"
# a token that cannot be accepted because of causality.
sprinkle_candidates = (
torch.arange(k).expand(batch_size, k) >
last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1)
sprinkle_candidates = (torch.arange(k).expand(
batch_size,
k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) +
1)
sprinkle = torch.rand(batch_size, k) > 0.5
accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates]
return accepted
......@@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution(
distance_wrt_reference)
expected_improvement_multiplier = 20
assert (relative_change_in_distance_wrt_target >
relative_change_in_distance_wrt_reference *
assert (relative_change_in_distance_wrt_target
> relative_change_in_distance_wrt_reference *
expected_improvement_multiplier)
......
......@@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types):
print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
'parallelism)'.format(length, total_cpu_time,
total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' %
(len(entries), len(entries) / (length)))
print(' {} build steps completed, average of {:1.2f}/s'.format(
len(entries),
len(entries) / (length)))
def main():
......
......@@ -820,8 +820,8 @@ def scaled_int8_quant(
if scale is not None:
# static-per-tensor quantization.
assert symmetric == (
azp is
None), "azp must only be provided for asymmetric quantization."
azp
is None), "azp must only be provided for asymmetric quantization."
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp)
return output, scale, azp
......
......@@ -219,8 +219,8 @@ if triton.__version__ >= "2.1.0":
float("-inf"))
if SLIDING_WINDOW > 0:
qk = tl.where(
offs_m[:, None] -
(start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000)
offs_m[:, None] - (start_n + offs_n[None, :])
< SLIDING_WINDOW, qk, -10000)
# -- compute m_ij, p, l_ij
m_ij = tl.max(qk, 1)
......@@ -324,10 +324,10 @@ if triton.__version__ >= "2.1.0":
(cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
cur_head * stride_qh + offs_d[None, :] * stride_qd)
q = tl.load(
Q + off_q,
mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
other=0.0)
q = tl.load(Q + off_q,
mask=offs_m[:, None]
< cur_batch_seq_len - cur_batch_ctx_len,
other=0.0)
# # initialize pointer to m and l
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
......@@ -402,8 +402,8 @@ if triton.__version__ >= "2.1.0":
# -- compute qk ----
k = tl.load(k_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_kbs,
mask=(start_n + offs_n[None, :]) <
cur_batch_seq_len - cur_batch_ctx_len,
mask=(start_n + offs_n[None, :])
< cur_batch_seq_len - cur_batch_ctx_len,
other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
......@@ -430,8 +430,8 @@ if triton.__version__ >= "2.1.0":
# update acc
v = tl.load(v_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_vbs,
mask=(start_n + offs_n[:, None]) <
cur_batch_seq_len - cur_batch_ctx_len,
mask=(start_n + offs_n[:, None])
< cur_batch_seq_len - cur_batch_ctx_len,
other=0.0)
p = p.to(v.dtype)
......@@ -639,8 +639,8 @@ if triton.__version__ >= "2.1.0":
k = tl.load(k_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_kbs,
mask=dim_mask[:, None] &
((start_n + offs_n[None, :]) <
cur_batch_seq_len - cur_batch_ctx_len),
((start_n + offs_n[None, :])
< cur_batch_seq_len - cur_batch_ctx_len),
other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
......@@ -677,8 +677,8 @@ if triton.__version__ >= "2.1.0":
v = tl.load(v_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_vbs,
mask=dim_mask[None, :] &
((start_n + offs_n[:, None]) <
cur_batch_seq_len - cur_batch_ctx_len),
((start_n + offs_n[:, None])
< cur_batch_seq_len - cur_batch_ctx_len),
other=0.0)
p = p.to(v.dtype)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment