Unverified commit 1eb2b9c1, authored by Peter Pan, committed by GitHub
Browse files

[CI] update typos config for CI pre-commit and fix some spells (#20919)


Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
parent 6ebf3137
...@@ -21,7 +21,7 @@ repos: ...@@ -21,7 +21,7 @@ repos:
- id: ruff-format - id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.* files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.32.0 rev: v1.34.0
hooks: hooks:
- id: typos - id: typos
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
......
...@@ -58,7 +58,7 @@ namespace { ...@@ -58,7 +58,7 @@ namespace {
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_LAST_DIM_CONTIGUOUS(x) \ #define CHECK_LAST_DIM_CONTIGUOUS(x) \
TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention") TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")
#define CHECK_INPUT(x) \ #define CHECK_INPUT(x) \
CHECK_CPU(x); \ CHECK_CPU(x); \
......
...@@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl( ...@@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
int64_t topk, int64_t topk,
int64_t num_tokens_post_pad); int64_t num_tokens_post_pad);
// shared expert implememntation for int8 w8a8 // shared expert implementation for int8 w8a8
template <typename scalar_t> template <typename scalar_t>
void shared_expert_int8_kernel_impl( void shared_expert_int8_kernel_impl(
scalar_t* __restrict__ output, scalar_t* __restrict__ output,
......
...@@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> { ...@@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
__m512 vd0; __m512 vd0;
__m512 vd1[COLS]; __m512 vd1[COLS];
// oops! 4x4 spills but luckly we use 4x2 // oops! 4x4 spills but luckily we use 4x2
__m512 vbias[COLS]; __m512 vbias[COLS];
// [NOTE]: s8s8 igemm compensation in avx512-vnni // [NOTE]: s8s8 igemm compensation in avx512-vnni
......
...@@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto ...@@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
#define CVT_FP16_TO_FP32(a) \ #define CVT_FP16_TO_FP32(a) \
_mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
// this doesn't hanel NaN. // this doesn't handle NaN.
inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) { inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
const __m512i x = _mm512_cvtepu8_epi16(fp8_vec); const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);
......
...@@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly ...@@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
ARG PIP_KEYRING_PROVIDER=disabled ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER} ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
# Flag enables build-in KV-connector dependency libs into docker images # Flag enables built-in KV-connector dependency libs into docker images
ARG INSTALL_KV_CONNECTORS=false ARG INSTALL_KV_CONNECTORS=false
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
......
...@@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i ...@@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i
Models using selective state-space mechanisms instead of standard transformer attention are partially supported. Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require (e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
enforcing eager mode and disabling prefix caching in V1. enforcing eager mode and disabling prefix caching in V1.
Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
......
...@@ -174,3 +174,186 @@ respect-ignore-files = true ...@@ -174,3 +174,186 @@ respect-ignore-files = true
[tool.ty.environment] [tool.ty.environment]
python = "./.venv" python = "./.venv"
[tool.typos.files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[tool.typos.default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
# splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
[tool.typos.default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[tool.typos.type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
fo = "fo"
ba = "ba"
[tool.typos.type.py.extend-words]
[tool.typos.type.cpp]
extend-glob = ["*.cu"]
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cpp.extend-identifiers]
countr_one = "countr_one"
k_ot = "k_ot"
ot = "ot"
[tool.typos.type.cpp.extend-words]
[tool.typos.type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.rust.extend-identifiers]
flate2 = "flate2"
[tool.typos.type.rust.extend-words]
ser = "ser"
[tool.typos.type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.lock.extend-identifiers]
[tool.typos.type.lock.extend-words]
[tool.typos.type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.jl.extend-identifiers]
[tool.typos.type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[tool.typos.type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.go.extend-identifiers]
flate = "flate"
[tool.typos.type.go.extend-words]
[tool.typos.type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.css.extend-identifiers]
nd = "nd"
[tool.typos.type.css.extend-words]
[tool.typos.type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.man.extend-identifiers]
Nd = "Nd"
[tool.typos.type.man.extend-words]
[tool.typos.type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cert.extend-identifiers]
[tool.typos.type.cert.extend-words]
[tool.typos.type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.sh.extend-identifiers]
ot = "ot"
[tool.typos.type.sh.extend-words]
[tool.typos.type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.vimscript.extend-identifiers]
windo = "windo"
[tool.typos.type.vimscript.extend-words]
...@@ -416,7 +416,7 @@ class RankTensors: ...@@ -416,7 +416,7 @@ class RankTensors:
# We dequant and use that as hidden_states so the tests are stable. # We dequant and use that as hidden_states so the tests are stable.
# quantizing and dequantizing yield slightly different results # quantizing and dequantizing yield slightly different results
# depending on the hardware. Here we, quantize and dequantize # depending on the hardware. Here we, quantize and dequantize
# first - so further quantize and dequantize will yeild the same # first - so further quantize and dequantize will yield the same
# values. # values.
if config.is_per_tensor_act_quant: if config.is_per_tensor_act_quant:
a_q, a_scales = ops.scaled_fp8_quant( a_q, a_scales = ops.scaled_fp8_quant(
......
...@@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size): ...@@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
# triton referrence # triton reference
out_triton = fused_experts( out_triton = fused_experts(
hidden_states=tokens_bf16, hidden_states=tokens_bf16,
w1=w1, w1=w1,
......
...@@ -43,7 +43,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): ...@@ -43,7 +43,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
text_config = hf_config.get_text_config() text_config = hf_config.get_text_config()
# Ensure at least 2 expert per group # Ensure at least 2 expert per group
# Since `grouped_topk` assums top-2 # Since `grouped_topk` assumes top-2
n_group = getattr(text_config, 'n_group', None) n_group = getattr(text_config, 'n_group', None)
num_experts = n_group * 2 if n_group is not None else 2 num_experts = n_group * 2 if n_group is not None else 2
......
...@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b" ...@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"
# Number of data parallel ranks for external LB testing # Number of data parallel ranks for external LB testing
DP_SIZE = int(os.getenv("DP_SIZE", "2")) DP_SIZE = int(os.getenv("DP_SIZE", "2"))
# Default tensor parallell size to use # Default tensor parallel size to use
TP_SIZE = int(os.getenv("TP_SIZE", "1")) TP_SIZE = int(os.getenv("TP_SIZE", "1"))
......
[files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
[type.py.extend-words]
[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cpp.extend-identifiers]
countr_one = "countr_one"
[type.cpp.extend-words]
[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.rust.extend-identifiers]
flate2 = "flate2"
[type.rust.extend-words]
ser = "ser"
[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.lock.extend-identifiers]
[type.lock.extend-words]
[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.jl.extend-identifiers]
[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.go.extend-identifiers]
flate = "flate"
[type.go.extend-words]
[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.css.extend-identifiers]
nd = "nd"
[type.css.extend-words]
[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.man.extend-identifiers]
Nd = "Nd"
[type.man.extend-words]
[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.cert.extend-identifiers]
[type.cert.extend-words]
[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"
[type.sh.extend-words]
[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[type.vimscript.extend-identifiers]
windo = "windo"
[type.vimscript.extend-words]
...@@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl): ...@@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
"... H (two D) -> ... (H two) D", "... H (two D) -> ... (H two) D",
two=2) two=2)
else: # re-use the kv cache, full attention else: # reuse the kv cache, full attention
q = q.view(-1, self.num_heads, self.head_size) q = q.view(-1, self.num_heads, self.head_size)
q1, q2 = self.split_heads(q) q1, q2 = self.split_heads(q)
# kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501 # kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501
......
...@@ -372,7 +372,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -372,7 +372,7 @@ class OpenAIServingResponses(OpenAIServing):
}) })
# Append the new input. # Append the new input.
# Reponses API supports simple text inputs without chat format. # Responses API supports simple text inputs without chat format.
if isinstance(request.input, str): if isinstance(request.input, str):
messages.append({"role": "user", "content": request.input}) messages.append({"role": "user", "content": request.input})
else: else:
......
...@@ -1172,7 +1172,7 @@ def fused_experts( ...@@ -1172,7 +1172,7 @@ def fused_experts(
allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor: allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor:
# For now, disable DeepGemm for small N (<= 512) until better # For now, disable DeepGemm for small N (<= 512) until better
# permute/unpermute ops are available. # permute/unpermute ops are available.
# However, on B200, we use DeepGemm for all cases becuase they only support # However, on B200, we use DeepGemm for all cases because they only support
# E8M0 scale, which means we requantize the weight and input to the specific # E8M0 scale, which means we requantize the weight and input to the specific
# scale. Fallen back to cutlass or triton for some cases would cause # scale. Fallen back to cutlass or triton for some cases would cause
# accuracy issue. # accuracy issue.
......
...@@ -193,7 +193,7 @@ class SambaYAttention(nn.Module): ...@@ -193,7 +193,7 @@ class SambaYAttention(nn.Module):
], ],
dim=-1) dim=-1)
attn_output = self.attn(q, k, v) attn_output = self.attn(q, k, v)
else: # re-use the kv cache, full attention else: # reuse the kv cache, full attention
q = self.Wqkv(hidden_states) q = self.Wqkv(hidden_states)
attn_output = self.attn(q, None, None) attn_output = self.attn(q, None, None)
attn_output = attn_output.view(-1, self.num_heads * self.head_dim) attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
......
...@@ -394,7 +394,7 @@ def use_cudnn_prefill() -> bool: ...@@ -394,7 +394,7 @@ def use_cudnn_prefill() -> bool:
# Currently 394MB, this can be tuned based on GEMM sizes used. # Currently 394MB, this can be tuned based on GEMM sizes used.
# Choosen to be the same as sglang: # Chosen to be the same as sglang:
# https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37 # https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37
FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024 FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024
......
...@@ -969,7 +969,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): ...@@ -969,7 +969,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
else: else:
mm_embeds = [] mm_embeds = []
xm.mark_step() xm.mark_step()
# Prepare inputs, the requests might be splitted into multiple # Prepare inputs, the requests might be split into multiple
# executions, combine the result of each execution. # executions, combine the result of each execution.
start_index = 0 start_index = 0
combined_selected_tokens: list[torch.Tensor] = [] combined_selected_tokens: list[torch.Tensor] = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment