"vscode:/vscode.git/clone" did not exist on "3cd36660f72f75b888c82a8feac93ea9f17c8e1e"
Unverified commit 6a895197, authored by Jiayi Yan and committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
...@@ -72,7 +72,7 @@ obj_json="objects.json" ...@@ -72,7 +72,7 @@ obj_json="objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR" mkdir -p "$INDICES_OUTPUT_DIR"
# call script to generate indicies for all existing wheels # call script to generate indices for all existing wheels
# these indices have relative paths that work as long as they are placed next to the wheel directory in s3 # these indices have relative paths that work as long as they are placed next to the wheel directory in s3
# i.e., the wheels are always in s3://vllm-wheels/<commit>/ # i.e., the wheels are always in s3://vllm-wheels/<commit>/
# and indices can be placed in /<commit>/, or /nightly/, or /<version>/ # and indices can be placed in /<commit>/, or /nightly/, or /<version>/
......
...@@ -467,7 +467,7 @@ steps: ...@@ -467,7 +467,7 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attetion (MI300)" test group # TODO: Add the "V1 Test attention (MI300)" test group
- label: V1 Test attention (H100) # 10min - label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
...@@ -2174,7 +2174,7 @@ steps: ...@@ -2174,7 +2174,7 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
# TODO: Add the "V1 Test attetion (MI300)" test group # TODO: Add the "V1 Test attention (MI300)" test group
- label: V1 Test attention (H100) # 10min - label: V1 Test attention (H100) # 10min
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
......
...@@ -13,7 +13,7 @@ repos: ...@@ -13,7 +13,7 @@ repos:
args: [--output-format, github, --fix] args: [--output-format, github, --fix]
- id: ruff-format - id: ruff-format
- repo: https://github.com/crate-ci/typos - repo: https://github.com/crate-ci/typos
rev: v1.38.1 rev: v1.43.5
hooks: hooks:
- id: typos - id: typos
args: [--force-exclude] args: [--force-exclude]
......
...@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]: ...@@ -30,7 +30,7 @@ def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
max_kv_len = max(r.kv_len for r in requests) if requests else 0 max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len) return (batch_size, max_q_len, max_kv_len)
except Exception: except Exception:
# Fallback for unparseable specs # Fallback for unparsable specs
return (0, 0, 0) return (0, 0, 0)
......
...@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int): ...@@ -202,7 +202,7 @@ def test_correctness(T: int, N: int):
# reference output # reference output
ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE) ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE)
# test ouptut # test output
out_q, out_s = output_from_impl( out_q, out_s = output_from_impl(
ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR
) )
......
...@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> { ...@@ -420,7 +420,7 @@ class AttentionImpl<ISA::AMX, scalar_t, head_dim> {
const int64_t block_size, const int64_t block_size_stride) { const int64_t block_size, const int64_t block_size_stride) {
// For AMX 2D tiles, size of each line is 64 bytes // For AMX 2D tiles, size of each line is 64 bytes
constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES; constexpr int64_t amx_tile_row_size = AMX_TILE_ROW_BYTES;
// For AMX B martix, N always is 16 // For AMX B matrix, N always is 16
constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4; constexpr int64_t amx_b_tile_n_size = AMX_TILE_ROW_BYTES / 4;
constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t); constexpr int64_t amx_b_tile_k_size = amx_tile_row_size / sizeof(scalar_t);
// For now suppose block_size is divisible by amx_tile_column_num // For now suppose block_size is divisible by amx_tile_column_num
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include <torch/library.h> #include <torch/library.h>
// Note: overwrite the external defination for sharing same name between // Note: overwrite the external definition for sharing same name between
// libraries use different ISAs. // libraries use different ISAs.
#define TORCH_EXTENSION_NAME _C #define TORCH_EXTENSION_NAME _C
......
...@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel( ...@@ -35,11 +35,11 @@ __global__ void batched_moe_align_block_size_kernel(
int32_t const block_ids_size = sorted_ids_size / block_size; int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL = int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries. num_batches * max_tokens_per_batch; // To denote invalid entries.
// Intialize sorted_ids // Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) { for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL; sorted_ids[i] = SENTINEL;
} }
// Intialize expert_ids with -1 // Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) { for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1; block_ids[i] = -1;
} }
......
...@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( ...@@ -542,7 +542,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
if (!lane_id) { if (!lane_id) {
// Store scales. // Store scales.
if constexpr (std::is_same<scale_t, uint8_t>::value) { if constexpr (std::is_same<scale_t, uint8_t>::value) {
// Packed UE8MO format. Remove Mantissa. // Packed UE8M0 format. Remove Mantissa.
*y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7; *y_s_ptr = reinterpret_cast<int16_t&>(y_s) >> 7;
bool const jump_pack = (current_group_id + 1) % 4 == 0; bool const jump_pack = (current_group_id + 1) % 4 == 0;
......
...@@ -1476,7 +1476,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) ...@@ -1476,7 +1476,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
#endif #endif
// B[] staging is cooperative across GrpsShrB, so sync here before reading // B[] staging is cooperative across GrpsShrB, so sync here before reading
// back. This wait is currently inserted by compiler, but not gauranteed. // back. This wait is currently inserted by compiler, but not guaranteed.
asm volatile("s_waitcnt 0"); asm volatile("s_waitcnt 0");
__syncthreads(); __syncthreads();
......
...@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal ...@@ -98,7 +98,7 @@ The goal of this structure is to uniquely identify a (padded) batch with minimal
### `CudagraphDispatcher` ### `CudagraphDispatcher`
The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWarpper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher. The [CudagraphDispatcher][vllm.v1.cudagraph_dispatcher.CudagraphDispatcher] takes responsibility for maintaining two sets of valid dispatching keys, one set for `FULL` runtime mode and one set for `PIECEWISE` runtime mode, and dispatches the correct runtime mode and the dispatching keys before executing the model's forwards. It will take in the initial key (a rough batch_descriptor for the padded input) and return the selected runtime mode and the final batch_descriptor, then tell the CUDAGraphWrapper instances that decision through forward contexts. Notice that `CudagraphDispatcher` is the only source of truth for available CUDA Graph keys and `CUDAGraphWrapper` instances can blindly trust the forward context on what CUDA Graphs to dispatch to. This lets us simplify the wrapper code and centralize the logic in the dispatcher.
The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config. The dispatching keys are initialized through the dispatcher's `initialize_cudagraph_keys` method, which is called by the gpu_model_runner after all possible attention backends are initialized. This is where we can get much fancier in the future and “prepare” all kinds of CUDA Graphs combinations. For now, we just append available keys based on the valid combos of `decode_mode`/`mixed_mode` of `cudagraph_mode` and `cudagraph_capture_sizes` in the compilation config.
......
...@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe ...@@ -47,7 +47,7 @@ The TopK Weight Application and Reduction components happen right after the Unpe
Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py). Please find the implementations of TopKWeightAndReduce [here](../../vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py).
`FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method. `FusedMoEPrepareAndFinalizeModular::finalize()` method accepts a `TopKWeightAndReduce` argument that is invoked inside the method.
The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPerpareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens. The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEExpertsModular` and `FusedMoEPrepareAndFinalize` implementations to determine where the TopK Weight Application and Reduction happens.
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself. * `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceNoOp` if the `FusedMoEExpertsModular` implementation does the weight application and reduction itself.
* `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction. * `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` method returns `TopKWeightAndReduceContiguous` / `TopKWeightAndReduceNaiveBatched` / `TopKWeightAndReduceDelegate` if the `FusedMoEExpertsModular` implementation needs the `FusedMoEPrepareAndFinalizeModular::finalize()` to do the weight application and reduction.
......
...@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests, ...@@ -352,7 +352,7 @@ The `BatchUpdate` abstraction models the persistent batch as a list of requests,
(s, d, UNIDIRECTIONAL or SWAP) (s, d, UNIDIRECTIONAL or SWAP)
``` ```
* If the Move specifies `UNIDRECTIONAL`: * If the Move specifies `UNIDIRECTIONAL`:
* The request at index `s` is moved to index `d`; index `s` becomes an empty slot * The request at index `s` is moved to index `d`; index `s` becomes an empty slot
......
...@@ -141,7 +141,7 @@ Every plugin has three parts: ...@@ -141,7 +141,7 @@ Every plugin has three parts:
- triton ops - triton ops
Custom way doesn't work for triton ops now. Custom way doesn't work for triton ops now.
7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc. 7. (optional) Implement other pluggable modules, such as lora, graph backend, quantization, mamba attention backend, etc.
## Compatibility Guarantee ## Compatibility Guarantee
......
...@@ -641,7 +641,7 @@ Then you obtain the sparse embeddings like this: ...@@ -641,7 +641,7 @@ Then you obtain the sparse embeddings like this:
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3", "model": "BAAI/bge-m3",
"task": "token_classify", "task": "token_classify",
"input": ["What is BGE M3?", "Defination of BM25"] "input": ["What is BGE M3?", "Definition of BM25"]
}' }'
``` ```
...@@ -657,7 +657,7 @@ You can obtain the colbert embeddings like this: ...@@ -657,7 +657,7 @@ You can obtain the colbert embeddings like this:
curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
"model": "BAAI/bge-m3", "model": "BAAI/bge-m3",
"task": "token_embed", "task": "token_embed",
"input": ["What is BGE M3?", "Defination of BM25"] "input": ["What is BGE M3?", "Definition of BM25"]
}' }'
``` ```
......
...@@ -349,7 +349,7 @@ ...@@ -349,7 +349,7 @@
"defaults": { "defaults": {
"color": { "mode": "thresholds" }, "color": { "mode": "thresholds" },
"mappings": [ "mappings": [
{ "options": { "Calcultion": { "index": 0, "text": "Last (not null)" } }, "type": "value" } { "options": { "Calculation": { "index": 0, "text": "Last (not null)" } }, "type": "value" }
], ],
"thresholds": { "thresholds": {
"mode": "absolute", "mode": "absolute",
......
...@@ -124,193 +124,54 @@ python = "./.venv" ...@@ -124,193 +124,54 @@ python = "./.venv"
[tool.typos.files] [tool.typos.files]
# these files may contain non-English words # these files may contain non-English words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*", "benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*",
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py",
"docs/governance/process.md"] "docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"]
ignore-hidden = true ignore-hidden = false
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
[tool.typos.default] [tool.typos.default]
binary = false extend-ignore-identifiers-re = [".*[Uu][Ee][0-9][Mm][0-9].*"]
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.default.extend-identifiers] [tool.typos.default.extend-identifiers]
bbc5b7ede = "bbc5b7ede" bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
# splitted_input = "splitted_input"
NOOPs = "NOOPs" NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut" nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin" cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel" depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d" pard_token = "pard_token"
depthwise_seperable_CNN = "depthwise_seperable_CNN" ptd_token_id = "ptd_token_id"
ser_de = "ser_de"
shared_memory_per_block_optin = "shared_memory_per_block_optin"
FoPE = "FoPE"
k_ot = "k_ot"
view_seperator = "view_seperator"
inverse_std_variences = "inverse_std_variences"
[tool.typos.default.extend-words] [tool.typos.default.extend-words]
iy = "iy" iy = "iy"
tendencias = "tendencias"
indx = "indx" indx = "indx"
# intel cpu features # intel cpu features
tme = "tme" tme = "tme"
dout = "dout" dout = "dout"
Pn = "Pn" Pn = "Pn"
arange = "arange" arange = "arange"
thw = "thw"
subtile = "subtile"
HSA = "HSA"
setp = "setp"
CPY = "CPY"
thr = "thr"
Thr = "Thr"
PARD = "PARD" PARD = "PARD"
pard = "pard" pard = "pard"
AKS = "AKS" AKS = "AKS"
[tool.typos.type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
fo = "fo"
ba = "ba" ba = "ba"
fo = "fo"
[tool.typos.type.py.extend-words]
ba = "ba"
nd = "nd"
[tool.typos.type.cpp]
extend-glob = ["*.cu"]
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cpp.extend-identifiers]
countr_one = "countr_one"
k_ot = "k_ot"
ot = "ot"
[tool.typos.type.cpp.extend-words]
[tool.typos.type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.rust.extend-identifiers]
flate2 = "flate2"
[tool.typos.type.rust.extend-words]
ser = "ser"
[tool.typos.type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.lock.extend-identifiers]
[tool.typos.type.lock.extend-words]
[tool.typos.type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.jl.extend-identifiers]
[tool.typos.type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[tool.typos.type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.go.extend-identifiers]
flate = "flate"
[tool.typos.type.go.extend-words]
[tool.typos.type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.css.extend-identifiers]
nd = "nd" nd = "nd"
eles = "eles"
[tool.typos.type.css.extend-words] datas = "datas"
[tool.typos.type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.man.extend-identifiers]
Nd = "Nd"
[tool.typos.type.man.extend-words]
[tool.typos.type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cert.extend-identifiers]
[tool.typos.type.cert.extend-words]
[tool.typos.type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.sh.extend-identifiers]
ot = "ot"
[tool.typos.type.sh.extend-words]
[tool.typos.type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.vimscript.extend-identifiers]
windo = "windo"
[tool.typos.type.vimscript.extend-words]
[tool.uv] [tool.uv]
no-build-isolation-package = ["torch"] no-build-isolation-package = ["torch"]
...@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch ...@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
expected_num_backend_compilations = 4 expected_num_backend_compilations = 4
# A has support_torch_compile but enable_if fn returns False # A has support_torch_compile but enable_if fn returns False
# enalbe_if will be True for B, so we expect mod1 and mod2 # enable_if will be True for B, so we expect mod1 and mod2
# to be compiled # to be compiled
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=2, num_graphs_seen=2,
......
...@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch): ...@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
f"Expected {expected1}, got {result1}" f"Expected {expected1}, got {result1}"
) )
# Second call should triger another compilation # Second call should trigger another compilation
x2 = torch.tensor([1, 2, 3]) x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2) result2 = wrapper(x2)
expected2 = torch.tensor([100, 200, 300]) expected2 = torch.tensor([100, 200, 300])
......
...@@ -444,7 +444,7 @@ def ref_multi_query_kv_attention( ...@@ -444,7 +444,7 @@ def ref_multi_query_kv_attention(
@pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention]) @pytest.mark.parametrize("attention_cls", [Attention, MMEncoderAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: def test_num_heads_not_divisible_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64 head_size = 64
scale = float(1.0 / (head_size**0.5)) scale = float(1.0 / (head_size**0.5))
num_heads = 16 num_heads = 16
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment