gen_popular_issues.py

# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datetime import datetime
from typing import List, Union

import pandas as pd
import requests
import mkdocs_gen_files


REPOSITORY = "argilla-io/distilabel"
DATA_PATH = "sections/community/popular_issues.md"

# public_repo and read:org scopes are required
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")


def fetch_data_from_github(repository, auth_token):
    if auth_token is None:
        return pd.DataFrame(
            {
                "Issue": [],
                "State": [],
                "Created at": [],
                "Milestone": [],
                "Reactions": [],
                "Comments": [],
                "URL": [],
                "Author": [],
                "Author association": [],
            }
        )
    headers = {"Authorization": f"token {auth_token}", "Accept": "application/vnd.github.v3+json"}
    issues_data = []

    print(f"Fetching issues from {repository}...")
    with requests.Session() as session:
        session.headers.update(headers)

        owner, repo_name = repository.split("/")
        issues_url = f"https://api.github.com/repos/{owner}/{repo_name}/issues?state=all"

        while issues_url:
            response = session.get(issues_url)
            issues = response.json()

            for issue in issues:
                if "pull_request" in issue:
                    continue
                issues_data.append(
                    {
                        "Issue": f"{issue['number']} - {issue['title']}",
                        "State": issue["state"],
                        "Created at": issue["created_at"],
                        "Milestone": (issue.get("milestone") or {}).get("title"),
                        "Reactions": issue["reactions"]["total_count"],
                        "Comments": issue["comments"],
                        "URL": issue["html_url"],
                        "Author": issue["user"]["login"],
                        "Author association": issue["author_association"],
                    }
                )

            issues_url = response.links.get("next", {}).get("url", None)

    return pd.DataFrame(issues_data)


with mkdocs_gen_files.open(DATA_PATH, "w") as f:
    df = fetch_data_from_github(REPOSITORY, GITHUB_TOKEN)

    open_issues = df.loc[df["State"] == "open"]
    engagement_df = (
        open_issues[["URL", "Issue", "Reactions", "Comments"]]
        .sort_values(by=["Reactions", "Comments"], ascending=False)
        .head(10)
        .reset_index()
    )

    community_issues = df[df["Author association"] != "MEMBER"]
    community_issues_df = (
        community_issues[["URL", "Issue", "Created at", "Author", "State"]]
        .sort_values(by=["Created at"], ascending=False)
        .head(10)
        .reset_index()
    )

    df["Milestone"] = df["Milestone"].astype(str).fillna("")
    planned_issues = df[
        ((df["Milestone"].str.startswith("v1")) & (df["State"] == "open"))
        | ((df["Milestone"].str.startswith("1")) & (df["State"] == "open"))
    ]
    planned_issues_df = (
        planned_issues[["URL", "Issue", "Created at", "Milestone", "State"]]
        .sort_values(by=["Milestone"], ascending=True)
        .head(10)
        .reset_index()
    )

    f.write('=== "Most engaging open issues"\n\n')
    f.write("    | Rank | Issue | Reactions | Comments |\n")
    f.write("    |------|-------|:---------:|:--------:|\n")
    for ix, row in engagement_df.iterrows():
        f.write(f"    | {ix+1} | [{row['Issue']}]({row['URL']}) | 👍 {row['Reactions']} | 💬 {row['Comments']} |\n")

    f.write('\n=== "Latest issues open by the community"\n\n')
    f.write("    | Rank | Issue | Author |\n")
    f.write("    |------|-------|:------:|\n")
    for ix, row in community_issues_df.iterrows():
        state = "🟢" if row["State"] == "open" else "🟣"
        f.write(f"    | {ix+1} | {state} [{row['Issue']}]({row['URL']}) | by **{row['Author']}** |\n")

    f.write('\n=== "Planned issues for upcoming releases"\n\n')
    f.write("    | Rank | Issue | Milestone |\n")
    f.write("    |------|-------|:------:|\n")
    for ix, row in planned_issues_df.iterrows():
        state = "🟢" if row["State"] == "open" else "🟣"
        f.write(f"    | {ix+1} | {state} [{row['Issue']}]({row['URL']}) |  **{row['Milestone']}** |\n")

    today = datetime.today().date()
    f.write(f"\nLast update: {today}\n")