get_ci_error_statistics.py 11 KB
Newer Older
1
2
3
4
5
import argparse
import json
import math
import os
import time
6
import traceback
7
8
9
10
11
12
import zipfile
from collections import Counter

import requests


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def get_jobs(workflow_run_id, token=None):
    """Extract jobs in a GitHub Actions workflow run"""

    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
    result = requests.get(url, headers=headers).json()
    jobs = []

    try:
        jobs.extend(result["jobs"])
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
            result = requests.get(url + f"&page={i + 2}", headers=headers).json()
            jobs.extend(result["jobs"])

        return jobs
    except Exception:
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")

    return []


39
def get_job_links(workflow_run_id, token=None):
40
41
    """Extract job names and their job links in a GitHub Actions workflow run"""

42
43
44
45
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

46
    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
47
    result = requests.get(url, headers=headers).json()
48
49
50
51
52
53
54
    job_links = {}

    try:
        job_links.update({job["name"]: job["html_url"] for job in result["jobs"]})
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
55
            result = requests.get(url + f"&page={i + 2}", headers=headers).json()
56
57
58
            job_links.update({job["name"]: job["html_url"] for job in result["jobs"]})

        return job_links
59
60
    except Exception:
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")
61
62
63
64

    return {}


65
def get_artifacts_links(worflow_run_id, token=None):
66
67
    """Get all artifact links from a workflow run"""

68
69
70
71
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

72
    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{worflow_run_id}/artifacts?per_page=100"
73
    result = requests.get(url, headers=headers).json()
74
75
76
77
78
79
80
    artifacts = {}

    try:
        artifacts.update({artifact["name"]: artifact["archive_download_url"] for artifact in result["artifacts"]})
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
81
            result = requests.get(url + f"&page={i + 2}", headers=headers).json()
82
83
84
            artifacts.update({artifact["name"]: artifact["archive_download_url"] for artifact in result["artifacts"]})

        return artifacts
85
86
    except Exception:
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")
87
88
89
90
91
92
93

    return {}


def download_artifact(artifact_name, artifact_url, output_dir, token):
    """Download a GitHub Action artifact from a URL.

94
    The URL is of the form `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`,
95
96
97
    but it can't be used to download directly. We need to get a redirect URL first.
    See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
    """
98
99
100
101
102
103
104
105
106
107
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    result = requests.get(artifact_url, headers=headers, allow_redirects=False)
    download_url = result.headers["Location"]
    response = requests.get(download_url, allow_redirects=True)
    file_path = os.path.join(output_dir, f"{artifact_name}.zip")
    with open(file_path, "wb") as fp:
        fp.write(response.content)
108
109


110
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
111
112
113
    """Extract errors from a downloaded artifact (in .zip format)"""
    errors = []
    failed_tests = []
114
    job_name = None
115
116
117
118
119

    with zipfile.ZipFile(artifact_zip_path) as z:
        for filename in z.namelist():
            if not os.path.isdir(filename):
                # read the file
120
                if filename in ["failures_line.txt", "summary_short.txt", "job_name.txt"]:
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
                    with z.open(filename) as f:
                        for line in f:
                            line = line.decode("UTF-8").strip()
                            if filename == "failures_line.txt":
                                try:
                                    # `error_line` is the place where `error` occurs
                                    error_line = line[: line.index(": ")]
                                    error = line[line.index(": ") + len(": ") :]
                                    errors.append([error_line, error])
                                except Exception:
                                    # skip un-related lines
                                    pass
                            elif filename == "summary_short.txt" and line.startswith("FAILED "):
                                # `test` is the test method that failed
                                test = line[len("FAILED ") :]
                                failed_tests.append(test)
137
138
                            elif filename == "job_name.txt":
                                job_name = line
139
140
141
142
143
144
145
146

    if len(errors) != len(failed_tests):
        raise ValueError(
            f"`errors` and `failed_tests` should have the same number of elements. Got {len(errors)} for `errors` "
            f"and {len(failed_tests)} for `failed_tests` instead. The test reports in {artifact_zip_path} have some"
            " problem."
        )

147
148
149
150
151
152
153
154
    job_link = None
    if job_name and job_links:
        job_link = job_links.get(job_name, None)

    # A list with elements of the form (line of error, error, failed test)
    result = [x + [y] + [job_link] for x, y in zip(errors, failed_tests)]

    return result
155
156


157
def get_all_errors(artifact_dir, job_links=None):
158
159
160
161
162
163
    """Extract errors from all artifact files"""

    errors = []

    paths = [os.path.join(artifact_dir, p) for p in os.listdir(artifact_dir) if p.endswith(".zip")]
    for p in paths:
164
        errors.extend(get_errors_from_single_artifact(p, job_links=job_links))
165

166
    return errors
167
168


169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def reduce_by_error(logs, error_filter=None):
    """count each error"""

    counter = Counter()
    counter.update([x[1] for x in logs])
    counts = counter.most_common()
    r = {}
    for error, count in counts:
        if error_filter is None or error not in error_filter:
            r[error] = {"count": count, "failed_tests": [(x[2], x[0]) for x in logs if x[1] == error]}

    r = dict(sorted(r.items(), key=lambda item: item[1]["count"], reverse=True))
    return r


def get_model(test):
    """Get the model name from a test method"""
    test = test.split("::")[0]
    if test.startswith("tests/models/"):
        test = test.split("/")[2]
    else:
        test = None

    return test


def reduce_by_model(logs, error_filter=None):
    """count each error per model"""

    logs = [(x[0], x[1], get_model(x[2])) for x in logs]
    logs = [x for x in logs if x[2] is not None]
200
    tests = {x[2] for x in logs}
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

    r = {}
    for test in tests:
        counter = Counter()
        # count by errors in `test`
        counter.update([x[1] for x in logs if x[2] == test])
        counts = counter.most_common()
        error_counts = {error: count for error, count in counts if (error_filter is None or error not in error_filter)}
        n_errors = sum(error_counts.values())
        if n_errors > 0:
            r[test] = {"count": n_errors, "errors": error_counts}

    r = dict(sorted(r.items(), key=lambda item: item[1]["count"], reverse=True))
    return r


def make_github_table(reduced_by_error):
218
219
    header = "| no. | error | status |"
    sep = "|-:|:-|:-|"
220
221
222
    lines = [header, sep]
    for error in reduced_by_error:
        count = reduced_by_error[error]["count"]
223
        line = f"| {count} | {error[:100]} |  |"
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
        lines.append(line)

    return "\n".join(lines)


def make_github_table_per_model(reduced_by_model):
    header = "| model | no. of errors | major error | count |"
    sep = "|-:|-:|-:|-:|"
    lines = [header, sep]
    for model in reduced_by_model:
        count = reduced_by_model[model]["count"]
        error, _count = list(reduced_by_model[model]["errors"].items())[0]
        line = f"| {model} | {count} | {error[:60]} | {_count} |"
        lines.append(line)

    return "\n".join(lines)


242
243
244
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
245
    parser.add_argument("--workflow_run_id", type=str, required=True, help="A GitHub Actions workflow run id.")
246
247
248
249
250
251
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Where to store the downloaded artifacts and other result files.",
    )
252
    parser.add_argument("--token", default=None, type=str, help="A token that has actions:read permission.")
253
254
255
256
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

257
    _job_links = get_job_links(args.workflow_run_id, token=args.token)
258
259
260
261
262
263
264
265
266
267
268
269
270
    job_links = {}
    # To deal with `workflow_call` event, where a job name is the combination of the job names in the caller and callee.
    # For example, `PyTorch 1.11 / Model tests (models/albert, single-gpu)`.
    if _job_links:
        for k, v in _job_links.items():
            # This is how GitHub actions combine job names.
            if " / " in k:
                index = k.find(" / ")
                k = k[index + len(" / ") :]
            job_links[k] = v
    with open(os.path.join(args.output_dir, "job_links.json"), "w", encoding="UTF-8") as fp:
        json.dump(job_links, fp, ensure_ascii=False, indent=4)

271
    artifacts = get_artifacts_links(args.workflow_run_id, token=args.token)
272
273
274
275
276
277
278
279
    with open(os.path.join(args.output_dir, "artifacts.json"), "w", encoding="UTF-8") as fp:
        json.dump(artifacts, fp, ensure_ascii=False, indent=4)

    for idx, (name, url) in enumerate(artifacts.items()):
        download_artifact(name, url, args.output_dir, args.token)
        # Be gentle to GitHub
        time.sleep(1)

280
    errors = get_all_errors(args.output_dir, job_links=job_links)
281

282
    # `e[1]` is the error
283
284
285
286
287
288
289
290
291
292
293
    counter = Counter()
    counter.update([e[1] for e in errors])

    # print the top 30 most common test errors
    most_common = counter.most_common(30)
    for item in most_common:
        print(item)

    with open(os.path.join(args.output_dir, "errors.json"), "w", encoding="UTF-8") as fp:
        json.dump(errors, fp, ensure_ascii=False, indent=4)

294
295
    reduced_by_error = reduce_by_error(errors)
    reduced_by_model = reduce_by_model(errors)
296
297
298
299
300
301
302
303

    s1 = make_github_table(reduced_by_error)
    s2 = make_github_table_per_model(reduced_by_model)

    with open(os.path.join(args.output_dir, "reduced_by_error.txt"), "w", encoding="UTF-8") as fp:
        fp.write(s1)
    with open(os.path.join(args.output_dir, "reduced_by_model.txt"), "w", encoding="UTF-8") as fp:
        fp.write(s2)