Unverified Commit fa42e419 authored by Chang Su, committed by GitHub

ci: Revert openai_server related tests in AMD suites (#7449)

parent e5afb88b
@@ -163,48 +163,49 @@ class TestCacheReport(CustomTestCase):
            >= usage_2.prompt_tokens - self.min_cached
        )

    def test_cache_report_openai_async(self):
        print("=" * 100)

        async def run_test():
            task0 = asyncio.create_task(
                self.cache_report_openai_async(
                    "first request, to start the inference and let the next two request be started in the same batch"
                )
            )
            await asyncio.sleep(0.05)  # to force the first request to be started first
            task1 = asyncio.create_task(
                self.cache_report_openai_async(
                    "> can the same batch parallel request use the cache?"
                )
            )
            task2 = asyncio.create_task(
                self.cache_report_openai_async(
                    "> can the same batch parallel request use the cache?"
                )
            )
            result0, result1, result2 = await asyncio.gather(task0, task1, task2)
            cached_tokens0, prompt_tokens0 = result0
            cached_tokens1, prompt_tokens1 = result1
            cached_tokens2, prompt_tokens2 = result2
            print(
                f"Async request 0 - Cached tokens: {cached_tokens0}, Prompt tokens: {prompt_tokens0}"
            )
            print(
                f"Async request 1 - Cached tokens: {cached_tokens1}, Prompt tokens: {prompt_tokens1}"
            )
            print(
                f"Async request 2 - Cached tokens: {cached_tokens2}, Prompt tokens: {prompt_tokens2}"
            )
            # Assert that no requests used the cache (because first is alone, and the next two are in the same batch)
            # If a new optimisation limiting starting request with same prefix at the same time was added
            # to maximise the cache hit, this would not be true
            assert cached_tokens1 == cached_tokens2 == cached_tokens0

        asyncio.run(run_test())
    # TODO: flaky test
    # def test_cache_report_openai_async(self):
    #     print("=" * 100)
    #     async def run_test():
    #         task0 = asyncio.create_task(
    #             self.cache_report_openai_async(
    #                 "first request, to start the inference and let the next two request be started in the same batch"
    #             )
    #         )
    #         await asyncio.sleep(1)  # to force the first request to be started first
    #         task1 = asyncio.create_task(
    #             self.cache_report_openai_async(
    #                 "> can the same batch parallel request use the cache?"
    #             )
    #         )
    #         task2 = asyncio.create_task(
    #             self.cache_report_openai_async(
    #                 "> can the same batch parallel request use the cache?"
    #             )
    #         )
    #         result0, result1, result2 = await asyncio.gather(task0, task1, task2)
    #         cached_tokens0, prompt_tokens0 = result0
    #         cached_tokens1, prompt_tokens1 = result1
    #         cached_tokens2, prompt_tokens2 = result2
    #         print(
    #             f"Async request 0 - Cached tokens: {cached_tokens0}, Prompt tokens: {prompt_tokens0}"
    #         )
    #         print(
    #             f"Async request 1 - Cached tokens: {cached_tokens1}, Prompt tokens: {prompt_tokens1}"
    #         )
    #         print(
    #             f"Async request 2 - Cached tokens: {cached_tokens2}, Prompt tokens: {prompt_tokens2}"
    #         )
    #         # Assert that no requests used the cache (because first is alone, and the next two are in the same batch)
    #         # If a new optimisation limiting starting request with same prefix at the same time was added
    #         # to maximise the cache hit, this would not be true
    #         assert cached_tokens1 == cached_tokens2 == cached_tokens0
    #     asyncio.run(run_test())

if __name__ == "__main__":
......
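The hunk above disables the async cache-report test (marked as flaky) rather than fixing it. For readers who want to reproduce the timing-sensitive pattern it exercises outside the suite, here is a minimal self-contained sketch. send_request is a hypothetical stand-in for cache_report_openai_async and does not contact a real server, so only the scheduling structure carries over.

import asyncio


async def send_request(prompt: str) -> tuple[int, int]:
    # Hypothetical stand-in: a real version would call the OpenAI-compatible
    # endpoint and return (cached_tokens, prompt_tokens) from the usage field.
    await asyncio.sleep(0.1)
    return 0, len(prompt.split())


async def run_test() -> None:
    # Start one request alone so it forms its own batch.
    task0 = asyncio.create_task(send_request("warm-up request started alone"))
    await asyncio.sleep(0.05)  # give the first request a head start
    # Launch two identical-prefix requests so they land in the same batch.
    task1 = asyncio.create_task(send_request("same-prefix request"))
    task2 = asyncio.create_task(send_request("same-prefix request"))
    results = await asyncio.gather(task0, task1, task2)
    cached = [cached_tokens for cached_tokens, _ in results]
    # Requests scheduled in the same batch cannot reuse each other's prefix
    # cache, so all three should report the same cached-token count.
    assert cached[0] == cached[1] == cached[2]


if __name__ == "__main__":
    asyncio.run(run_test())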
@@ -113,11 +113,10 @@ suites = {
TestFile("models/test_qwen_models.py", 82),
TestFile("models/test_reward_models.py", 132),
TestFile("openai_server/basic/test_openai_embedding.py", 141),
TestFile("openai_server/basic/test_openai_server.py", 149),
TestFile("openai_server/basic/test_protocol.py", 10),
TestFile("openai_server/basic/test_serving_chat.py", 10),
TestFile("openai_server/basic/test_serving_completions.py", 10),
TestFile("openai_server/basic/test_serving_embedding.py", 10),
TestFile("openai_server/features/test_enable_thinking.py", 70),
TestFile("openai_server/features/test_reasoning_content.py", 89),
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
TestFile("openai_server/validation/test_request_length_validation.py", 31),
TestFile("test_abort.py", 51),
TestFile("test_block_int8.py", 22),
TestFile("test_create_kvindices.py", 2),
@@ -125,19 +124,6 @@ suites = {
TestFile("test_eval_fp8_accuracy.py", 303),
TestFile("test_function_call_parser.py", 10),
TestFile("test_input_embeddings.py", 38),
TestFile("openai_server/features/test_cache_report.py", 100),
TestFile("openai_server/features/test_enable_thinking.py", 70),
TestFile("openai_server/features/test_json_constrained.py", 98),
TestFile("openai_server/features/test_json_mode.py", 90),
TestFile("openai_server/features/test_openai_server_ebnf.py", 95),
TestFile("openai_server/features/test_openai_server_hidden_states.py", 240),
TestFile("openai_server/features/test_reasoning_content.py", 89),
TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
TestFile("openai_server/function_call/test_tool_choice.py", 226),
TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
TestFile("openai_server/validation/test_matched_stop.py", 60),
TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
TestFile("openai_server/validation/test_request_length_validation.py", 31),
TestFile("test_metrics.py", 32),
TestFile("test_no_chunked_prefill.py", 108),
TestFile("test_no_overlap_scheduler.py", 234),
......
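For context, each suite entry above pairs a test file path with a numeric estimate (presumably a per-file time budget in seconds) used when scheduling CI runs. The sketch below mirrors that structure under the assumption that TestFile is a simple dataclass; the actual helper and suite names in the repository may differ.

from dataclasses import dataclass


@dataclass
class TestFile:
    # Path of the test module, relative to the test root.
    name: str
    # Rough runtime estimate, used when splitting the suite across CI workers
    # (assumed to be seconds).
    estimated_time: float = 60


# Hypothetical suite registry mirroring the structure shown in the diff above.
suites = {
    "example-amd-suite": [
        TestFile("openai_server/basic/test_openai_embedding.py", 141),
        TestFile("test_metrics.py", 32),
    ],
}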