Unverified commit 6806c4e6, authored by Xiaoyu Zhang and committed by GitHub
Browse files

[CI monitor] Improve CI analyzer: fix job failure tracking and add CUDA-focused filtering (#11505)

parent 0c0779d6
This diff is collapsed.
...@@ -67,15 +67,14 @@ class SGLangCIAnalyzer: ...@@ -67,15 +67,14 @@ class SGLangCIAnalyzer:
return all_runs[:limit] return all_runs[:limit]
def analyze_ci_failures(self, runs: List[Dict]) -> Dict: def analyze_ci_failures(self, runs: List[Dict]) -> Dict:
"""Analyze CI failure patterns""" """Analyze CI failure patterns (CUDA jobs only)"""
print("Analyzing CI failure data...") print("Analyzing CI failure data (CUDA only)...")
# SGLang specific job categories # SGLang specific job categories (CUDA only)
job_categories = { job_categories = {
"sgl-kernel": [ "build": [
"build-test",
"sgl-kernel-build-wheels", "sgl-kernel-build-wheels",
"sgl-kernel-unit-test",
"sgl-kernel-mla-test",
], ],
"unit-test": [ "unit-test": [
"unit-test-frontend", "unit-test-frontend",
...@@ -87,11 +86,35 @@ class SGLangCIAnalyzer: ...@@ -87,11 +86,35 @@ class SGLangCIAnalyzer:
"performance": [ "performance": [
"performance-test-1-gpu-part-1", "performance-test-1-gpu-part-1",
"performance-test-1-gpu-part-2", "performance-test-1-gpu-part-2",
"performance-test-1-gpu-part-3",
"performance-test-2-gpu", "performance-test-2-gpu",
], ],
"accuracy": ["accuracy-test-1-gpu", "accuracy-test-2-gpu"], "accuracy": [
"deepep": ["unit-test-deepep-4-gpu", "unit-test-deepep-8-gpu"], "accuracy-test-1-gpu",
"b200": ["unit-test-backend-4-gpu-b200"], "accuracy-test-2-gpu",
],
"mla-test": [
"sgl-kernel-mla-test",
],
"deepep": [
"unit-test-deepep-4-gpu",
"unit-test-deepep-8-gpu",
],
"per-commit": [
"per-commit-8-gpu-h20",
],
"nightly": [
"nightly-test-perf-text-models",
"nightly-test-eval-text-models",
],
"integration": [
"run-all-notebooks",
"vllm-dependency-test",
"test-disaggregation",
],
"b200": [
"unit-test-backend-4-gpu-b200",
],
} }
stats = { stats = {
...@@ -141,13 +164,26 @@ class SGLangCIAnalyzer: ...@@ -141,13 +164,26 @@ class SGLangCIAnalyzer:
job_name = job.get("name", "Unknown") job_name = job.get("name", "Unknown")
job_conclusion = job.get("conclusion", "unknown") job_conclusion = job.get("conclusion", "unknown")
# Filter out non-specific CI jobs # Filter out non-specific CI jobs and non-CUDA jobs
if job_name not in [ # Skip meta jobs and AMD/NPU related jobs
"check-changes", if (
"pr-test-finish", job_name
"pr-test-h20-finish", not in [
"lint", "check-changes",
]: "pr-test-finish",
"pr-test-h20-finish",
"pr-test-amd-finish",
"pr-test-b200-finish",
"lint",
"Set up job",
]
and "-amd" not in job_name.lower()
and "mi300" not in job_name.lower()
and "mi325" not in job_name.lower()
and "gfx" not in job_name.lower()
and "-npu" not in job_name.lower()
and "ascend" not in job_name.lower()
):
# Record successful jobs (update last success) # Record successful jobs (update last success)
if job_conclusion == "success": if job_conclusion == "success":
stats["job_last_success"][job_name] = { stats["job_last_success"][job_name] = {
...@@ -158,7 +194,7 @@ class SGLangCIAnalyzer: ...@@ -158,7 +194,7 @@ class SGLangCIAnalyzer:
} }
# Record failed jobs # Record failed jobs
elif job_conclusion == "failure" and run_status == "failure": elif job_conclusion == "failure":
stats["job_failures"][job_name] += 1 stats["job_failures"][job_name] += 1
# Store failure link (keep only last 3 for each job) # Store failure link (keep only last 3 for each job)
...@@ -216,7 +252,7 @@ class SGLangCIAnalyzer: ...@@ -216,7 +252,7 @@ class SGLangCIAnalyzer:
return pr_info return pr_info
def _analyze_failure_pattern(self, job: Dict, stats: Dict): def _analyze_failure_pattern(self, job: Dict, stats: Dict):
"""Analyze failure patterns""" """Analyze failure patterns (CUDA jobs only)"""
job_name = job.get("name", "") job_name = job.get("name", "")
steps = job.get("steps", []) steps = job.get("steps", [])
...@@ -224,19 +260,33 @@ class SGLangCIAnalyzer: ...@@ -224,19 +260,33 @@ class SGLangCIAnalyzer:
if step.get("conclusion") == "failure": if step.get("conclusion") == "failure":
step_name = step.get("name", "") step_name = step.get("name", "")
# SGLang specific failure pattern recognition # SGLang specific failure pattern recognition (CUDA only)
if "timeout" in step_name.lower(): if "timeout" in step_name.lower():
stats["failure_patterns"]["Timeout"] += 1 stats["failure_patterns"]["Timeout"] += 1
elif "test" in step_name.lower() and "unit" in job_name.lower(): elif "build" in step_name.lower() or "build" in job_name.lower():
stats["failure_patterns"]["Build Failure"] += 1
elif "install" in step_name.lower() or "dependency" in job_name.lower():
stats["failure_patterns"]["Dependency Installation Failure"] += 1
elif "unit" in job_name.lower() or "unit-test" in job_name.lower():
stats["failure_patterns"]["Unit Test Failure"] += 1 stats["failure_patterns"]["Unit Test Failure"] += 1
elif "performance" in job_name.lower(): elif "performance" in job_name.lower() or "perf" in job_name.lower():
stats["failure_patterns"]["Performance Test Failure"] += 1 stats["failure_patterns"]["Performance Test Failure"] += 1
elif "accuracy" in job_name.lower(): elif "accuracy" in job_name.lower():
stats["failure_patterns"]["Accuracy Test Failure"] += 1 stats["failure_patterns"]["Accuracy Test Failure"] += 1
elif "build" in step_name.lower(): elif "mla" in job_name.lower():
stats["failure_patterns"]["Build Failure"] += 1 stats["failure_patterns"]["MLA Test Failure"] += 1
elif "install" in step_name.lower(): elif "deepep" in job_name.lower():
stats["failure_patterns"]["Dependency Installation Failure"] += 1 stats["failure_patterns"]["DeepEP Test Failure"] += 1
elif "nightly" in job_name.lower():
stats["failure_patterns"]["Nightly Test Failure"] += 1
elif "notebook" in job_name.lower():
stats["failure_patterns"]["Notebook Test Failure"] += 1
elif "disaggregation" in job_name.lower():
stats["failure_patterns"]["Disaggregation Test Failure"] += 1
elif "h20" in job_name.lower() or "h200" in job_name.lower():
stats["failure_patterns"]["H20/H200 GPU Failure"] += 1
elif "b200" in job_name.lower():
stats["failure_patterns"]["B200 GPU Failure"] += 1
elif "gpu" in job_name.lower(): elif "gpu" in job_name.lower():
stats["failure_patterns"]["GPU Related Failure"] += 1 stats["failure_patterns"]["GPU Related Failure"] += 1
else: else:
...@@ -245,7 +295,7 @@ class SGLangCIAnalyzer: ...@@ -245,7 +295,7 @@ class SGLangCIAnalyzer:
def generate_report(self, stats: Dict): def generate_report(self, stats: Dict):
"""Generate CI analysis report""" """Generate CI analysis report"""
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("SGLang CI Analysis Report") print("SGLang CI Analysis Report (CUDA Only)")
print("=" * 60) print("=" * 60)
# Overall statistics # Overall statistics
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment