# Study 4: What is the optimal reorder_batch_threshold for MLA backends supporting query length > 1?
# Question: At what query length does the prefill pipeline become faster than the decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find the crossover point
# Applies to: FlashAttn MLA, FlashMLA

description: "Decode vs Prefill pipeline crossover analysis"

# Test FlashAttn MLA
# NOTE(review): the file header also lists FlashMLA, but only one backend is
# selected here — presumably FlashMLA is covered by overriding this value;
# confirm.
backend: FLASH_ATTN_MLA

# Mode: decode_vs_prefill comparison (special sweep mode).
# Every batch spec is benchmarked twice: once through the decode pipeline and
# once through the prefill pipeline.
mode: "decode_vs_prefill"

# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
#   - decode: threshold >= query_length (forces decode pipeline)
#   - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
  # Three back-to-back ranges with step 1, 2, then 4: dense sampling at small
  # query lengths, coarser sampling as q_len grows.
  # NOTE(review): assumes `end_inclusive: false` excludes `stop`, so each range
  # ends where the next one starts (16, then 64) without duplicated specs —
  # confirm against the sweep generator.
  - template: "q{q_len}s1k"
    q_len:
      start: 1
      stop: 16
      step: 1
      end_inclusive: false
  - template: "q{q_len}s1k"
    q_len:
      start: 16
      stop: 64
      step: 2
      end_inclusive: false
  # Last range is inclusive, so q_len=1024 (equal to the 1k sequence length in
  # the template) is also generated.
  - template: "q{q_len}s1k"
    q_len:
      start: 64
      stop: 1024
      step: 4
      end_inclusive: true

# Batch sizes to test (from old script): powers of two from 1 to 256
batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]

# Model configuration (DeepSeek V2/V3 defaults)
# NOTE(review): the per-key notes below are assumptions about how the benchmark
# interprets these values — confirm against the consumer.
model:
  # presumably a reduced layer count to keep the sweep fast, not the full model depth
  num_layers: 10
  # presumably the MLA latent width plus the RoPE dimensions (512 + 64)
  head_dim: 576
  num_q_heads: 128
  # presumably 1 because MLA shares a single latent KV head across query heads
  num_kv_heads: 1
  # presumably the KV-cache block (page) size in tokens
  block_size: 128

# Benchmark settings
device: "cuda:0"
repeats: 15          # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false

# Output: one CSV file and one JSON file are named for the results.
# NOTE(review): presumably relative paths resolved against the working
# directory of the benchmark run — confirm.
output:
  csv: "reorder_threshold_results.csv"
  json: "reorder_threshold_results.json"

# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine the optimal reorder_batch_threshold from the last query length at which decode is still faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
#     * decode: threshold >= query_length (forces decode pipeline)
#     * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#