Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
6806c4e6
Unverified
Commit
6806c4e6
authored
Oct 13, 2025
by
Xiaoyu Zhang
Committed by
GitHub
Oct 13, 2025
Browse files
[CI monitor] Improve CI analyzer: fix job failure tracking and add CUDA-focused filtering (#11505)
parent
0c0779d6
Changes
2
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
76 additions
and
554 deletions
+76
-554
scripts/ci_monitor/README.md
scripts/ci_monitor/README.md
+0
-528
scripts/ci_monitor/ci_analyzer.py
scripts/ci_monitor/ci_analyzer.py
+76
-26
No files found.
scripts/ci_monitor/README.md
View file @
6806c4e6
This diff is collapsed.
Click to expand it.
scripts/ci_monitor/ci_analyzer.py
View file @
6806c4e6
...
@@ -67,15 +67,14 @@ class SGLangCIAnalyzer:
...
@@ -67,15 +67,14 @@ class SGLangCIAnalyzer:
return
all_runs
[:
limit
]
return
all_runs
[:
limit
]
def
analyze_ci_failures
(
self
,
runs
:
List
[
Dict
])
->
Dict
:
def
analyze_ci_failures
(
self
,
runs
:
List
[
Dict
])
->
Dict
:
"""Analyze CI failure patterns"""
"""Analyze CI failure patterns
(CUDA jobs only)
"""
print
(
"Analyzing CI failure data..."
)
print
(
"Analyzing CI failure data
(CUDA only)
..."
)
# SGLang specific job categories
# SGLang specific job categories
(CUDA only)
job_categories
=
{
job_categories
=
{
"sgl-kernel"
:
[
"build"
:
[
"build-test"
,
"sgl-kernel-build-wheels"
,
"sgl-kernel-build-wheels"
,
"sgl-kernel-unit-test"
,
"sgl-kernel-mla-test"
,
],
],
"unit-test"
:
[
"unit-test"
:
[
"unit-test-frontend"
,
"unit-test-frontend"
,
...
@@ -87,11 +86,35 @@ class SGLangCIAnalyzer:
...
@@ -87,11 +86,35 @@ class SGLangCIAnalyzer:
"performance"
:
[
"performance"
:
[
"performance-test-1-gpu-part-1"
,
"performance-test-1-gpu-part-1"
,
"performance-test-1-gpu-part-2"
,
"performance-test-1-gpu-part-2"
,
"performance-test-1-gpu-part-3"
,
"performance-test-2-gpu"
,
"performance-test-2-gpu"
,
],
],
"accuracy"
:
[
"accuracy-test-1-gpu"
,
"accuracy-test-2-gpu"
],
"accuracy"
:
[
"deepep"
:
[
"unit-test-deepep-4-gpu"
,
"unit-test-deepep-8-gpu"
],
"accuracy-test-1-gpu"
,
"b200"
:
[
"unit-test-backend-4-gpu-b200"
],
"accuracy-test-2-gpu"
,
],
"mla-test"
:
[
"sgl-kernel-mla-test"
,
],
"deepep"
:
[
"unit-test-deepep-4-gpu"
,
"unit-test-deepep-8-gpu"
,
],
"per-commit"
:
[
"per-commit-8-gpu-h20"
,
],
"nightly"
:
[
"nightly-test-perf-text-models"
,
"nightly-test-eval-text-models"
,
],
"integration"
:
[
"run-all-notebooks"
,
"vllm-dependency-test"
,
"test-disaggregation"
,
],
"b200"
:
[
"unit-test-backend-4-gpu-b200"
,
],
}
}
stats
=
{
stats
=
{
...
@@ -141,13 +164,26 @@ class SGLangCIAnalyzer:
...
@@ -141,13 +164,26 @@ class SGLangCIAnalyzer:
job_name
=
job
.
get
(
"name"
,
"Unknown"
)
job_name
=
job
.
get
(
"name"
,
"Unknown"
)
job_conclusion
=
job
.
get
(
"conclusion"
,
"unknown"
)
job_conclusion
=
job
.
get
(
"conclusion"
,
"unknown"
)
# Filter out non-specific CI jobs
# Filter out non-specific CI jobs and non-CUDA jobs
if
job_name
not
in
[
# Skip meta jobs and AMD/NPU related jobs
if
(
job_name
not
in
[
"check-changes"
,
"check-changes"
,
"pr-test-finish"
,
"pr-test-finish"
,
"pr-test-h20-finish"
,
"pr-test-h20-finish"
,
"pr-test-amd-finish"
,
"pr-test-b200-finish"
,
"lint"
,
"lint"
,
]:
"Set up job"
,
]
and
"-amd"
not
in
job_name
.
lower
()
and
"mi300"
not
in
job_name
.
lower
()
and
"mi325"
not
in
job_name
.
lower
()
and
"gfx"
not
in
job_name
.
lower
()
and
"-npu"
not
in
job_name
.
lower
()
and
"ascend"
not
in
job_name
.
lower
()
):
# Record successful jobs (update last success)
# Record successful jobs (update last success)
if
job_conclusion
==
"success"
:
if
job_conclusion
==
"success"
:
stats
[
"job_last_success"
][
job_name
]
=
{
stats
[
"job_last_success"
][
job_name
]
=
{
...
@@ -158,7 +194,7 @@ class SGLangCIAnalyzer:
...
@@ -158,7 +194,7 @@ class SGLangCIAnalyzer:
}
}
# Record failed jobs
# Record failed jobs
elif
job_conclusion
==
"failure"
and
run_status
==
"failure"
:
elif
job_conclusion
==
"failure"
:
stats
[
"job_failures"
][
job_name
]
+=
1
stats
[
"job_failures"
][
job_name
]
+=
1
# Store failure link (keep only last 3 for each job)
# Store failure link (keep only last 3 for each job)
...
@@ -216,7 +252,7 @@ class SGLangCIAnalyzer:
...
@@ -216,7 +252,7 @@ class SGLangCIAnalyzer:
return
pr_info
return
pr_info
def
_analyze_failure_pattern
(
self
,
job
:
Dict
,
stats
:
Dict
):
def
_analyze_failure_pattern
(
self
,
job
:
Dict
,
stats
:
Dict
):
"""Analyze failure patterns"""
"""Analyze failure patterns
(CUDA jobs only)
"""
job_name
=
job
.
get
(
"name"
,
""
)
job_name
=
job
.
get
(
"name"
,
""
)
steps
=
job
.
get
(
"steps"
,
[])
steps
=
job
.
get
(
"steps"
,
[])
...
@@ -224,19 +260,33 @@ class SGLangCIAnalyzer:
...
@@ -224,19 +260,33 @@ class SGLangCIAnalyzer:
if
step
.
get
(
"conclusion"
)
==
"failure"
:
if
step
.
get
(
"conclusion"
)
==
"failure"
:
step_name
=
step
.
get
(
"name"
,
""
)
step_name
=
step
.
get
(
"name"
,
""
)
# SGLang specific failure pattern recognition
# SGLang specific failure pattern recognition
(CUDA only)
if
"timeout"
in
step_name
.
lower
():
if
"timeout"
in
step_name
.
lower
():
stats
[
"failure_patterns"
][
"Timeout"
]
+=
1
stats
[
"failure_patterns"
][
"Timeout"
]
+=
1
elif
"test"
in
step_name
.
lower
()
and
"unit"
in
job_name
.
lower
():
elif
"build"
in
step_name
.
lower
()
or
"build"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Build Failure"
]
+=
1
elif
"install"
in
step_name
.
lower
()
or
"dependency"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Dependency Installation Failure"
]
+=
1
elif
"unit"
in
job_name
.
lower
()
or
"unit-test"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Unit Test Failure"
]
+=
1
stats
[
"failure_patterns"
][
"Unit Test Failure"
]
+=
1
elif
"performance"
in
job_name
.
lower
():
elif
"performance"
in
job_name
.
lower
()
or
"perf"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Performance Test Failure"
]
+=
1
stats
[
"failure_patterns"
][
"Performance Test Failure"
]
+=
1
elif
"accuracy"
in
job_name
.
lower
():
elif
"accuracy"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Accuracy Test Failure"
]
+=
1
stats
[
"failure_patterns"
][
"Accuracy Test Failure"
]
+=
1
elif
"build"
in
step_name
.
lower
():
elif
"mla"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Build Failure"
]
+=
1
stats
[
"failure_patterns"
][
"MLA Test Failure"
]
+=
1
elif
"install"
in
step_name
.
lower
():
elif
"deepep"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Dependency Installation Failure"
]
+=
1
stats
[
"failure_patterns"
][
"DeepEP Test Failure"
]
+=
1
elif
"nightly"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Nightly Test Failure"
]
+=
1
elif
"notebook"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Notebook Test Failure"
]
+=
1
elif
"disaggregation"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"Disaggregation Test Failure"
]
+=
1
elif
"h20"
in
job_name
.
lower
()
or
"h200"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"H20/H200 GPU Failure"
]
+=
1
elif
"b200"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"B200 GPU Failure"
]
+=
1
elif
"gpu"
in
job_name
.
lower
():
elif
"gpu"
in
job_name
.
lower
():
stats
[
"failure_patterns"
][
"GPU Related Failure"
]
+=
1
stats
[
"failure_patterns"
][
"GPU Related Failure"
]
+=
1
else
:
else
:
...
@@ -245,7 +295,7 @@ class SGLangCIAnalyzer:
...
@@ -245,7 +295,7 @@ class SGLangCIAnalyzer:
def
generate_report
(
self
,
stats
:
Dict
):
def
generate_report
(
self
,
stats
:
Dict
):
"""Generate CI analysis report"""
"""Generate CI analysis report"""
print
(
"
\n
"
+
"="
*
60
)
print
(
"
\n
"
+
"="
*
60
)
print
(
"SGLang CI Analysis Report"
)
print
(
"SGLang CI Analysis Report
(CUDA Only)
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
# Overall statistics
# Overall statistics
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment