Unverified commit 8bd26dd4, authored by Mick and committed by GitHub
Browse files

ci: fix night-ci with push retry mechanism (#11765)

parent ab07cd3e
...@@ -7,6 +7,8 @@ import base64 ...@@ -7,6 +7,8 @@ import base64
import json import json
import os import os
import sys import sys
import time
from urllib.error import HTTPError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
...@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None): ...@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None):
try: try:
with urlopen(req) as response: with urlopen(req) as response:
return response.read().decode("utf-8") return response.read().decode("utf-8")
except Exception as e: except HTTPError as e:
print(f"GitHub API request failed: {e}") print(f"GitHub API request failed: {e}")
if hasattr(e, "read"):
try: try:
error_body = e.read().decode("utf-8") error_body = e.read().decode("utf-8")
print(f"Error response body: {error_body}") print(f"Error response body: {error_body}")
except: e.error_body = error_body # Attach for later inspection
pass except Exception:
e.error_body = ""
raise
except Exception as e:
print(f"GitHub API request failed with a non-HTTP error: {e}")
raise raise
...@@ -196,6 +201,10 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False): ...@@ -196,6 +201,10 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
) )
sys.exit(1) sys.exit(1)
max_retries = 5
retry_delay = 5 # seconds
for attempt in range(max_retries):
try: try:
# Get current branch head # Get current branch head
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token) branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
...@@ -214,7 +223,12 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False): ...@@ -214,7 +223,12 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
# Create commit # Create commit
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)" commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
commit_sha = create_commit( commit_sha = create_commit(
repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token repo_owner,
repo_name,
new_tree_sha,
branch_sha,
commit_message,
token,
) )
print(f"Created commit: {commit_sha}") print(f"Created commit: {commit_sha}")
...@@ -223,8 +237,22 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False): ...@@ -223,8 +237,22 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
print("Updated branch reference") print("Updated branch reference")
print("Successfully published all traces in a single commit") print("Successfully published all traces in a single commit")
return
except Exception as e: except Exception as e:
is_ff_error = False
if (
hasattr(e, "error_body")
and "Update is not a fast forward" in e.error_body
):
is_ff_error = True
if is_ff_error and attempt < max_retries - 1:
print(
f"Attempt {attempt + 1} failed: not a fast-forward update. Retrying in {retry_delay} seconds..."
)
time.sleep(retry_delay)
else:
print(f"Failed to publish traces: {e}") print(f"Failed to publish traces: {e}")
raise raise
......
...@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = { ...@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = {
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings( ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"] "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1), ): ModelEvalMetrics(0.29, 37.0),
ModelLaunchSettings( ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503" "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7), ): ModelEvalMetrics(0.310, 16.7),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment