Unverified Commit 8690c40b authored by fzyzcjy's avatar fzyzcjy Committed by GitHub
Browse files

Improve stack trace of retry errors (#4845)

parent b1cfb4e9
...@@ -35,6 +35,7 @@ import sys ...@@ -35,6 +35,7 @@ import sys
import tempfile import tempfile
import threading import threading
import time import time
import traceback
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from functools import lru_cache from functools import lru_cache
...@@ -1766,3 +1767,32 @@ def parse_connector_type(url: str) -> str: ...@@ -1766,3 +1767,32 @@ def parse_connector_type(url: str) -> str:
return "" return ""
return m.group(1) return m.group(1)
def retry(
    fn,
    max_retry: int,
    initial_delay: float = 2.0,
    max_delay: float = 60.0,
    should_retry: Callable[[Any], bool] = lambda e: True,
):
    """Call ``fn`` repeatedly until it succeeds, backing off between failures.

    ``fn`` is invoked with no arguments. If it raises an ``Exception``, the
    error is logged, its traceback is printed, and the call is attempted again
    after a randomized exponential delay. ``fn`` is called at most
    ``max_retry + 1`` times.

    Args:
        fn: Zero-argument callable to execute.
        max_retry: Number of retries allowed after the first attempt. ``0``
            means a single attempt with no retry.
        initial_delay: Base delay in seconds before the first retry; doubled
            on every subsequent retry.
        max_delay: Upper bound in seconds applied to the delay before jitter.
        should_retry: Predicate receiving the caught exception; returning
            ``False`` aborts immediately instead of retrying.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: When the retry budget is exhausted or ``should_retry``
            rejects the error. The exception raised by ``fn`` is attached as
            ``__cause__`` so the original stack trace stays visible.
    """
    for try_index in itertools.count():
        try:
            return fn()
        except Exception as e:
            # Chain with ``from e`` so the root failure is reported as the
            # direct cause rather than as an incidental handling context.
            if try_index >= max_retry:
                raise Exception("retry() exceed maximum number of retries.") from e
            if not should_retry(e):
                raise Exception(
                    "retry() observe errors that should not be retried."
                ) from e

            # Exponential backoff capped at max_delay, then scaled by a random
            # factor in [0.75, 1.0).
            delay = min(initial_delay * (2**try_index), max_delay) * (
                0.75 + 0.25 * random.random()
            )

            logger.warning(
                f"retry() failed once ({try_index}th try, maximum {max_retry} retries). Will delay {delay:.2f}s and retry. Error: {e}"
            )
            traceback.print_exc()

            time.sleep(delay)
...@@ -25,7 +25,7 @@ from sglang.bench_serving import run_benchmark ...@@ -25,7 +25,7 @@ from sglang.bench_serving import run_benchmark
from sglang.global_config import global_config from sglang.global_config import global_config
from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.openai import OpenAI
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
from sglang.srt.utils import get_bool_env_var, kill_process_tree from sglang.srt.utils import get_bool_env_var, kill_process_tree, retry
from sglang.test.run_eval import run_eval from sglang.test.run_eval import run_eval
from sglang.utils import get_exception_traceback from sglang.utils import get_exception_traceback
...@@ -1010,26 +1010,10 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple): ...@@ -1010,26 +1010,10 @@ def run_logprob_check(self: unittest.TestCase, arg: Tuple):
class CustomTestCase(unittest.TestCase): class CustomTestCase(unittest.TestCase):
def _callTestMethod(self, method): def _callTestMethod(self, method):
_retry_execution( max_retry = int(
lambda: super(CustomTestCase, self)._callTestMethod(method), os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0")
max_retry=_get_max_retry(),
) )
retry(
lambda: super(CustomTestCase, self)._callTestMethod(method),
def _get_max_retry(): max_retry=max_retry,
return int(os.environ.get("SGLANG_TEST_MAX_RETRY", "2" if is_in_ci() else "0"))
def _retry_execution(fn, max_retry: int):
if max_retry == 0:
fn()
return
try:
fn()
except Exception as e:
print(
f"retry_execution failed once and will retry. This may be an error or a flaky test. Error: {e}"
) )
traceback.print_exc()
_retry_execution(fn, max_retry=max_retry - 1)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment