Unverified Commit 0376e72f authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

test: add unit test for sla planner's interpolator (#2505)

parent 62978595
......@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import scipy
......@@ -30,7 +31,7 @@ class PrefillInterpolator:
with np.load(prefill_npz_fn) as raw_data:
self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = raw_data["prefill_ttft"]
self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
self.min_isl = min(self.prefill_isl)
......@@ -143,7 +144,9 @@ class DecodeInterpolator:
ix, iy = self.compute_idx(concurrency, context_length)
return self.thpt_interpolator[iy, ix]
def find_best_throughput_per_gpu(self, itl: float, context_length: float) -> float:
def find_best_throughput_per_gpu(
self, itl: float, context_length: float
) -> tuple[float, float, float]:
# find the max kv_load that has itl <= target itl
# here we cannot use binary search as interpolated itl might not be monotonic
iy = int(
......@@ -157,5 +160,71 @@ class DecodeInterpolator:
for ix in range(self.resolution - 1, -1, -1):
if self.itl_interpolator[iy, ix] <= itl:
return self.thpt_interpolator[iy, ix]
return self.thpt_interpolator[iy, 0]
return (
self.thpt_interpolator[iy, ix],
self.itl_interpolator[iy, ix],
self.xi[ix],
)
return self.thpt_interpolator[iy, 0], self.itl_interpolator[iy, 0], self.xi[0]
if __name__ == "__main__":
    # Command-line smoke test: load profiling results, interpolate prefill and
    # decode performance for one (ISL, OSL) point, and report whether the
    # given TTFT / ITL targets (both in seconds) can be met.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--profile_results_dir", type=str, required=True)
    # Remaining options share the same shape: (flag, type, default, help).
    for flag, caster, fallback, help_text in (
        ("--isl", int, 3000, None),
        ("--osl", int, 150, None),
        ("--ttft", float, 0.1, "in s"),
        ("--itl", float, 0.01, "in s"),
    ):
        cli.add_argument(flag, type=caster, default=fallback, help=help_text)
    args = cli.parse_args()

    # Echo the inputs so the report is self-describing.
    print(f"ISL={args.isl}, OSL={args.osl}")
    print(f"TTFT={args.ttft}s, ITL={args.itl}s")
    print(f"Using profile results from {args.profile_results_dir}")
    print("")

    # ---- prefill ----
    print("Interpolating prefill performance ...")
    prefill_model = PrefillInterpolator(args.profile_results_dir)
    est_ttft = prefill_model.interpolate_ttft(args.isl)
    est_thpt_per_gpu = prefill_model.interpolate_thpt_per_gpu(args.isl)

    if est_ttft <= args.ttft:
        ttft_verdict = f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
    else:
        ttft_verdict = f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
    print(ttft_verdict)
    print(
        f"\tEstimated throughput: {est_thpt_per_gpu:.2f} tokens/s/gpu. Request rate at {est_thpt_per_gpu / args.isl:.2f} requests/s will saturate one GPU."
    )
    print("")

    # ---- decode ----
    decode_model = DecodeInterpolator(args.profile_results_dir)
    print("Interpolating decode performance ...")
    # Average context length during decode: full prompt plus half the output
    # (integer division, so an odd OSL is rounded down).
    context_length = args.isl + args.osl // 2
    print(f"\tAverage context length: isl + osl/2 = {context_length}.")

    est_thpt_per_gpu, est_itl, est_kv_usage = (
        decode_model.find_best_throughput_per_gpu(args.itl, context_length)
    )

    if est_itl <= args.itl:
        decode_report = [
            f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage.",
            f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU.",
        ]
    else:
        decode_report = [
            f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
        ]
    for report_line in decode_report:
        print(report_line)
......@@ -278,11 +278,13 @@ class Planner:
else:
corrected_itl = self.args.itl / self.d_correction_factor
# 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length
pred_decode_thpt_per_gpu = (
self.decode_interpolator.find_best_throughput_per_gpu(
(
pred_decode_thpt_per_gpu,
_,
_,
) = self.decode_interpolator.find_best_throughput_per_gpu(
itl=corrected_itl, context_length=next_isl + next_osl / 2
)
)
# 3. compute number of decode replicas needed
next_num_d = math.ceil(
next_num_req
......
......@@ -22,3 +22,42 @@ Use the pre-configured test deployment with sample profiling data, we provide th
### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
## Interpolator Testing
The SLA planner uses two interpolators to estimate the performance of prefill and decode. You can test the interpolators with the following command:
```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir <path_to_profile_results> \
--isl <ISL> \
--osl <OSL> \
--ttft <TTFT(s)> \
--itl <ITL(s)>
```
The script interpolates prefill and decode performance for the given ISL and OSL, checks the estimates against the TTFT and ITL SLAs, and reports the request rate that would saturate one GPU.
For example, to test the interpolators for `nvidia/Llama-3.1-8B-Instruct-FP8` on H200:
```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--isl 3000 \
--osl 150 \
--ttft 0.1 \
--itl 0.01
> ISL=3000, OSL=150
> TTFT=0.1s, ITL=0.01s
> Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/
>
> Interpolating prefill performance ...
> Estimated TTFT=0.027s <= target TTFT=0.100s. Requests can queue 0.073s maximally while meeting TTFT SLA.
> Estimated throughput: 110893.48 tokens/s/gpu. Request rate at 36.96 requests/s will saturate one GPU.
>
> Interpolating decode performance ...
> Average context length: isl + osl/2 = 3075.
> Estimated ITL=0.0098s <= target ITL=0.0100s at 33.33% active kv usage.
> Estimated throughput: 10226.60 token/s/gpu. Request rate at 68.18 requests/s will saturate one GPU.
```
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment