Unverified commit 8bd26dd4, authored by Mick, committed by GitHub
Browse files

ci: fix night-ci with push retry mechanism (#11765)

parent ab07cd3e
...@@ -7,6 +7,8 @@ import base64 ...@@ -7,6 +7,8 @@ import base64
import json import json
import os import os
import sys import sys
import time
from urllib.error import HTTPError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
...@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None): ...@@ -28,14 +30,17 @@ def make_github_request(url, token, method="GET", data=None):
try: try:
with urlopen(req) as response: with urlopen(req) as response:
return response.read().decode("utf-8") return response.read().decode("utf-8")
except Exception as e: except HTTPError as e:
print(f"GitHub API request failed: {e}") print(f"GitHub API request failed: {e}")
if hasattr(e, "read"): try:
try: error_body = e.read().decode("utf-8")
error_body = e.read().decode("utf-8") print(f"Error response body: {error_body}")
print(f"Error response body: {error_body}") e.error_body = error_body # Attach for later inspection
except: except Exception:
pass e.error_body = ""
raise
except Exception as e:
print(f"GitHub API request failed with a non-HTTP error: {e}")
raise raise
...@@ -196,37 +201,60 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False): ...@@ -196,37 +201,60 @@ def publish_traces(traces_dir, run_id, run_number, is_vlm=False):
) )
sys.exit(1) sys.exit(1)
try: max_retries = 5
# Get current branch head retry_delay = 5 # seconds
branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
print(f"Current branch head: {branch_sha}") for attempt in range(max_retries):
try:
# Get current tree # Get current branch head
tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token) branch_sha = get_branch_sha(repo_owner, repo_name, branch, token)
print(f"Current tree SHA: {tree_sha}") print(f"Current branch head: {branch_sha}")
# Create new tree with all files # Get current tree
new_tree_sha = create_tree( tree_sha = get_tree_sha(repo_owner, repo_name, branch_sha, token)
repo_owner, repo_name, tree_sha, files_to_upload, token print(f"Current tree SHA: {tree_sha}")
)
print(f"Created new tree: {new_tree_sha}") # Create new tree with all files
new_tree_sha = create_tree(
# Create commit repo_owner, repo_name, tree_sha, files_to_upload, token
commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)" )
commit_sha = create_commit( print(f"Created new tree: {new_tree_sha}")
repo_owner, repo_name, new_tree_sha, branch_sha, commit_message, token
) # Create commit
print(f"Created commit: {commit_sha}") commit_message = f"Nightly traces for run {run_id} at {run_number} ({len(files_to_upload)} files)"
commit_sha = create_commit(
# Update branch reference repo_owner,
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token) repo_name,
print("Updated branch reference") new_tree_sha,
branch_sha,
print("Successfully published all traces in a single commit") commit_message,
token,
except Exception as e: )
print(f"Failed to publish traces: {e}") print(f"Created commit: {commit_sha}")
raise
# Update branch reference
update_branch_ref(repo_owner, repo_name, branch, commit_sha, token)
print("Updated branch reference")
print("Successfully published all traces in a single commit")
return
except Exception as e:
is_ff_error = False
if (
hasattr(e, "error_body")
and "Update is not a fast forward" in e.error_body
):
is_ff_error = True
if is_ff_error and attempt < max_retries - 1:
print(
f"Attempt {attempt + 1} failed: not a fast-forward update. Retrying in {retry_delay} seconds..."
)
time.sleep(retry_delay)
else:
print(f"Failed to publish traces: {e}")
raise
def main(): def main():
......
...@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = { ...@@ -37,7 +37,7 @@ MODEL_THRESHOLDS = {
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9), ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings( ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"] "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1), ): ModelEvalMetrics(0.29, 37.0),
ModelLaunchSettings( ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503" "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7), ): ModelEvalMetrics(0.310, 16.7),
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment