Unverified Commit 0376e72f authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

test: add unit test for sla planner's interpolator (#2505)

parent 62978595
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import scipy import scipy
...@@ -30,7 +31,7 @@ class PrefillInterpolator: ...@@ -30,7 +31,7 @@ class PrefillInterpolator:
with np.load(prefill_npz_fn) as raw_data: with np.load(prefill_npz_fn) as raw_data:
self.prefill_isl = raw_data["prefill_isl"] self.prefill_isl = raw_data["prefill_isl"]
self.prefill_ttft = raw_data["prefill_ttft"] self.prefill_ttft = raw_data["prefill_ttft"] / 1000 # convert ms to s
self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"] self.prefill_thpt_per_gpu = raw_data["prefill_thpt_per_gpu"]
self.min_isl = min(self.prefill_isl) self.min_isl = min(self.prefill_isl)
...@@ -143,7 +144,9 @@ class DecodeInterpolator: ...@@ -143,7 +144,9 @@ class DecodeInterpolator:
ix, iy = self.compute_idx(concurrency, context_length) ix, iy = self.compute_idx(concurrency, context_length)
return self.thpt_interpolator[iy, ix] return self.thpt_interpolator[iy, ix]
def find_best_throughput_per_gpu(self, itl: float, context_length: float) -> float: def find_best_throughput_per_gpu(
self, itl: float, context_length: float
) -> tuple[float, float, float]:
# find the max kv_load that has itl <= target itl # find the max kv_load that has itl <= target itl
# here we cannot use binary search as interpolated itl might not be monotonic # here we cannot use binary search as interpolated itl might not be monotonic
iy = int( iy = int(
...@@ -157,5 +160,71 @@ class DecodeInterpolator: ...@@ -157,5 +160,71 @@ class DecodeInterpolator:
for ix in range(self.resolution - 1, -1, -1): for ix in range(self.resolution - 1, -1, -1):
if self.itl_interpolator[iy, ix] <= itl: if self.itl_interpolator[iy, ix] <= itl:
return self.thpt_interpolator[iy, ix] return (
return self.thpt_interpolator[iy, 0] self.thpt_interpolator[iy, ix],
self.itl_interpolator[iy, ix],
self.xi[ix],
)
return self.thpt_interpolator[iy, 0], self.itl_interpolator[iy, 0], self.xi[0]
if __name__ == "__main__":
    # Command-line smoke test for the two interpolators: given a directory of
    # profiling results plus ISL/OSL and TTFT/ITL targets, print the estimated
    # prefill and decode performance and whether each SLA can be met.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--profile_results_dir", type=str, required=True)
    # (flag, type, default, help) for the workload shape and SLA targets;
    # help=None is argparse's default, i.e. no help text for that flag.
    for flag, kind, fallback, note in (
        ("--isl", int, 3000, None),
        ("--osl", int, 150, None),
        ("--ttft", float, 0.1, "in s"),
        ("--itl", float, 0.01, "in s"),
    ):
        cli.add_argument(flag, type=kind, default=fallback, help=note)
    args = cli.parse_args()

    # Echo the effective configuration before doing any work.
    print(f"ISL={args.isl}, OSL={args.osl}")
    print(f"TTFT={args.ttft}s, ITL={args.itl}s")
    print(f"Using profile results from {args.profile_results_dir}")
    print()

    # ---- prefill: estimate TTFT and per-GPU throughput at the given ISL ----
    print("Interpolating prefill performance ...")
    prefill_model = PrefillInterpolator(args.profile_results_dir)
    est_ttft = prefill_model.interpolate_ttft(args.isl)
    est_thpt_per_gpu = prefill_model.interpolate_thpt_per_gpu(args.isl)

    ttft_sla_met = est_ttft <= args.ttft
    ttft_report = (
        f"\tEstimated TTFT={est_ttft:.3f}s <= target TTFT={args.ttft:.3f}s. Requests can queue {args.ttft - est_ttft:.3f}s maximally while meeting TTFT SLA."
        if ttft_sla_met
        else f"\tEstimated TTFT={est_ttft:.3f}s > target TTFT={args.ttft:.3f}s. Cannot meet TTFT SLA."
    )
    print(ttft_report)
    # The prefill throughput line is reported whether or not the TTFT SLA is met.
    print(
        f"\tEstimated throughput: {est_thpt_per_gpu:.2f} tokens/s/gpu. Request rate at {est_thpt_per_gpu / args.isl:.2f} requests/s will saturate one GPU."
    )
    print()

    # ---- decode: best per-GPU throughput that still satisfies the ITL ----
    decode_model = DecodeInterpolator(args.profile_results_dir)
    print("Interpolating decode performance ...")
    # Integer arithmetic: the full ISL plus half of the OSL (floored).
    context_length = args.isl + args.osl // 2
    print(f"\tAverage context length: isl + osl/2 = {context_length}.")

    est_thpt_per_gpu, est_itl, est_kv_usage = (
        decode_model.find_best_throughput_per_gpu(args.itl, context_length)
    )

    itl_sla_met = est_itl <= args.itl
    if itl_sla_met:
        print(
            f"\tEstimated ITL={est_itl:.4f}s <= target ITL={args.itl:.4f}s at {est_kv_usage*100:.2f}% active kv usage."
        )
        print(
            f"\tEstimated throughput: {est_thpt_per_gpu:.2f} token/s/gpu. Request rate at {est_thpt_per_gpu / args.osl:.2f} requests/s will saturate one GPU."
        )
    else:
        # The returned ITL still exceeds the target, so the SLA is unreachable.
        print(
            f"\tEstimated ITL={est_itl:.4f}s > target ITL={args.itl:.4f}s. Cannot meet ITL SLA."
        )
...@@ -278,11 +278,13 @@ class Planner: ...@@ -278,11 +278,13 @@ class Planner:
else: else:
corrected_itl = self.args.itl / self.d_correction_factor corrected_itl = self.args.itl / self.d_correction_factor
# 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length # 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length
pred_decode_thpt_per_gpu = ( (
self.decode_interpolator.find_best_throughput_per_gpu( pred_decode_thpt_per_gpu,
_,
_,
) = self.decode_interpolator.find_best_throughput_per_gpu(
itl=corrected_itl, context_length=next_isl + next_osl / 2 itl=corrected_itl, context_length=next_isl + next_osl / 2
) )
)
# 3. compute number of decode replicas needed # 3. compute number of decode replicas needed
next_num_d = math.ceil( next_num_d = math.ceil(
next_num_req next_num_req
......
...@@ -22,3 +22,42 @@ Use the pre-configured test deployment with sample profiling data, we provide th ...@@ -22,3 +22,42 @@ Use the pre-configured test deployment with sample profiling data, we provide th
### Option B: Use Your Own Profiling Results ### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions. 1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
## Interpolator Testing
The SLA planner uses two interpolators to estimate prefill and decode performance. You can test both interpolators with the following command:
```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir <path_to_profile_results> \
--isl <ISL> \
--osl <OSL> \
--ttft <TTFT(s)> \
--itl <ITL(s)>
```
The script interpolates prefill and decode performance for the given ISL and OSL, checks the estimates against the TTFT and ITL SLAs, and reports the request rate that would saturate one GPU.
For example, to test the interpolators for `nvidia/Llama-3.1-8B-Instruct-FP8` on an H200:
```bash
python components/planner/src/dynamo/planner/utils/perf_interpolation.py \
--profile_results_dir tests/planner/profiling_results/H200_TP1P_TP1D/ \
--isl 3000 \
--osl 150 \
--ttft 0.1 \
--itl 0.01
> ISL=3000, OSL=150
> TTFT=0.1s, ITL=0.01s
> Using profile results from tests/planner/profiling_results/H200_TP1P_TP1D/
>
> Interpolating prefill performance ...
> Estimated TTFT=0.027s <= target TTFT=0.100s. Requests can queue 0.073s maximally while meeting TTFT SLA.
> Estimated throughput: 110893.48 tokens/s/gpu. Request rate at 36.96 requests/s will saturate one GPU.
>
> Interpolating decode performance ...
> Average context length: isl + osl/2 = 3075.
> Estimated ITL=0.0098s <= target ITL=0.0100s at 33.33% active kv usage.
> Estimated throughput: 10226.60 token/s/gpu. Request rate at 68.18 requests/s will saturate one GPU.
```
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment