Unverified commit fba8eccd, authored by Lianmin Zheng, committed by GitHub
Browse files

Log if cuda graph is used & extend cuda graph capture to cuda-graph-max-bs (#6201)


Co-authored-by: SangBin Cho <rkooo567@gmail.com>
parent 7d3a3d45
...@@ -97,7 +97,9 @@ class TestEAGLEEngine(CustomTestCase): ...@@ -97,7 +97,9 @@ class TestEAGLEEngine(CustomTestCase):
print(f"{engine.get_server_info()=}") print(f"{engine.get_server_info()=}")
avg_spec_accept_length = engine.get_server_info()["avg_spec_accept_length"] avg_spec_accept_length = engine.get_server_info()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 1.9) self.assertGreater(avg_spec_accept_length, 1.9)
...@@ -296,7 +298,9 @@ class TestEAGLEServer(CustomTestCase): ...@@ -296,7 +298,9 @@ class TestEAGLEServer(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.20) self.assertGreater(metrics["accuracy"], 0.20)
server_info = requests.get(self.base_url + "/get_server_info").json() server_info = requests.get(self.base_url + "/get_server_info").json()
avg_spec_accept_length = server_info["avg_spec_accept_length"] avg_spec_accept_length = server_info["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
speculative_eagle_topk = server_info["speculative_eagle_topk"] speculative_eagle_topk = server_info["speculative_eagle_topk"]
......
...@@ -111,7 +111,9 @@ class BaseFlashAttentionTest(CustomTestCase): ...@@ -111,7 +111,9 @@ class BaseFlashAttentionTest(CustomTestCase):
if self.speculative_decode: if self.speculative_decode:
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold) self.assertGreater(avg_spec_accept_length, self.spec_decode_threshold)
......
...@@ -118,7 +118,9 @@ class TestDeepseekV3MTP(CustomTestCase): ...@@ -118,7 +118,9 @@ class TestDeepseekV3MTP(CustomTestCase):
print(f"{metrics=}") print(f"{metrics=}")
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
if is_in_ci(): if is_in_ci():
......
...@@ -100,7 +100,9 @@ class TestDeepseekV3MTP(CustomTestCase): ...@@ -100,7 +100,9 @@ class TestDeepseekV3MTP(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.60) self.assertGreater(metrics["accuracy"], 0.60)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
...@@ -159,7 +161,9 @@ class TestDeepseekV3MTPWithDraft(CustomTestCase): ...@@ -159,7 +161,9 @@ class TestDeepseekV3MTPWithDraft(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.60) self.assertGreater(metrics["accuracy"], 0.60)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
......
...@@ -158,7 +158,9 @@ class TestFlashinferMLAMTP(CustomTestCase): ...@@ -158,7 +158,9 @@ class TestFlashinferMLAMTP(CustomTestCase):
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
print(f"{server_info=}") print(f"{server_info=}")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
......
...@@ -105,7 +105,9 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase): ...@@ -105,7 +105,9 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.60) self.assertGreater(metrics["accuracy"], 0.60)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
...@@ -199,7 +201,9 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase): ...@@ -199,7 +201,9 @@ class TestDeepseekV3MTPBlockInt8(CustomTestCase):
self.assertGreater(metrics["accuracy"], 0.60) self.assertGreater(metrics["accuracy"], 0.60)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["avg_spec_accept_length"] avg_spec_accept_length = server_info.json()["internal_states"][0][
"avg_spec_accept_length"
]
print(f"{avg_spec_accept_length=}") print(f"{avg_spec_accept_length=}")
self.assertGreater(avg_spec_accept_length, 2.5) self.assertGreater(avg_spec_accept_length, 2.5)
......
...@@ -492,9 +492,6 @@ class TestSRTEndpoint(CustomTestCase): ...@@ -492,9 +492,6 @@ class TestSRTEndpoint(CustomTestCase):
max_total_num_tokens = response_json["max_total_num_tokens"] max_total_num_tokens = response_json["max_total_num_tokens"]
self.assertIsInstance(max_total_num_tokens, int) self.assertIsInstance(max_total_num_tokens, int)
attention_backend = response_json["attention_backend"]
self.assertIsInstance(attention_backend, str)
version = response_json["version"] version = response_json["version"]
self.assertIsInstance(version, str) self.assertIsInstance(version, str)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.