Commit 3555af4d authored by one's avatar one
Browse files

[xcl-lens] Create xcl-lens package from rccl_log_parser.py

parent b3030c9c
name: Publish xcl-lens to PyPI
on:
push:
tags:
- 'xcl-lens-*'
permissions:
contents: read
jobs:
release-build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Run tests
working-directory: projects/xcl-lens
run: |
uv sync --group dev
# make test
- name: Build release distributions
working-directory: projects/xcl-lens
run: uv build
- name: Upload distributions
uses: actions/upload-artifact@v4
with:
name: release-dists
path: projects/xcl-lens/dist/
pypi-publish:
runs-on: ubuntu-latest
needs:
- release-build
permissions:
# Required for Trusted Publishing (OIDC) - no PyPI token needed
# See: https://docs.pypi.org/trusted-publishers/
id-token: write
environment:
name: pypi
url: https://pypi.org/project/xcl-lens/
steps:
- name: Retrieve release distributions
uses: actions/download-artifact@v4
with:
name: release-dists
path: dist/
- name: Publish release distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: dist/
RCCL Log Parser
===============
## As a Wrapper
```bash
mpirun -np 4 ./rccl_log_parser.py \
./build/all_reduce_perf -b 4 -e 2G -f 2 -w 3 -n 3 -g 1
```
## Process an Existing File
```bash
./rccl_log_parser.py cat rccl-log.txt
```
\ No newline at end of file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
# Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
# poetry.lock
# poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
# pdm.lock
# pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
# pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# Redis
*.rdb
*.aof
*.pid
# RabbitMQ
mnesia/
rabbitmq/
rabbitmq-data/
# ActiveMQ
activemq-data/
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
# .idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# Streamlit
.streamlit/secrets.toml
\ No newline at end of file
MIT License
Copyright (c) 2026 alephpiece
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
.PHONY: help setup format lint test clean bump build
help:
@echo "Available targets:"
@echo " make setup - Create venv, install all dependencies"
@echo " make format - Auto-fix and format code (ruff)"
@echo " make lint - Check code style and errors without modifying files (ruff)"
@echo " make test - Run all unit tests (pytest)"
@echo " make clean - Remove build caches and the virtual environment"
@echo " make bump part=X - Bump version (patch/minor/major or set X.Y.Z)"
@echo " make build - Build wheel and sdist into dist/"
setup:
@echo ">> Initializing virtual environment and installing dependencies..."
uv sync --group dev
format:
uv run ruff check --fix src/
uv run ruff format src/
lint:
uv run ruff check src/
uv run ruff format --check src/
test:
uv run pytest -v
clean:
rm -rf .pytest_cache .ruff_cache .venv dist
find src -type f -name "*.pyc" -delete
find src -type d -name "__pycache__" -delete
bump:
uvx bump-my-version bump $(part)
build:
uv build
xcl-lens
========
RCCL/NCCL log parser and analyzer.
## Installation
### From PyPI
#### Using pipx (recommended for CLI use)
```bash
pipx install xcl-lens
```
#### Using uv
```bash
uv tool install xcl-lens
```
#### Using pip
```bash
pip install xcl-lens
```
### From Source
#### Using pipx
```bash
pipx install .
```
#### Using uv
```bash
# Install in editable mode
uv pip install -e .
# Or using uv's native install
uv tool install .
```
#### Using pip
```bash
pip install -e .
```
## Usage
### As a Wrapper
```bash
mpirun -np 4 xcl-lens \
./build/all_reduce_perf -b 4 -e 2G -f 2 -w 3 -n 3 -g 1
```
### Process an Existing File
```bash
xcl-lens cat rccl-log.txt
```
### Verbose Mode
By default, only the report is printed. Use `-v` or `--verbose` to also print raw log lines:
```bash
xcl-lens -v ./build/all_reduce_perf -b 4 -e 2G -f 2 -w 3 -n 3 -g 1
```
## Development
```bash
make setup # Set up development environment
make format # Auto-fix and format code (ruff)
make lint # Check code style and errors (ruff)
make test # Run all unit tests (pytest)
make build # Build wheel and sdist into dist/
make clean # Remove build caches and the virtual environment
```
\ No newline at end of file
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "xcl-lens"
dynamic = ["version"]
description = "RCCL/NCCL log parser and analyzer"
readme = "README.md"
license = { text = "MIT" }
authors = [{ name = "alephpiece", email = "wangan.cs@gmail.com" }]
requires-python = ">=3.10"
dependencies = ["pandas>=2.0.0"]
keywords = ["rccl", "nccl", "logging", "parser", "gpu"]
classifiers = [
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Operating System :: POSIX :: Linux",
"Environment :: Console",
]
[project.urls]
Homepage = "https://github.com/alephpiece/hg-misc-tools"
Repository = "https://github.com/alephpiece/hg-misc-tools"
Issues = "https://github.com/alephpiece/hg-misc-tools/issues"
[project.scripts]
xcl-lens = "xcl_lens.main:main"
[dependency-groups]
dev = ["pytest>=8", "ruff>=0.15"]
[tool.hatch.version]
path = "src/xcl_lens/__init__.py"
[tool.hatch.build.targets.wheel]
packages = ["src/xcl_lens"]
[tool.hatch.build.targets.sdist]
exclude = ["tests/__pycache__", ".ruff_cache", ".pytest_cache"]
[tool.pytest.ini_options]
testpaths = ["tests"]
[tool.ruff]
target-version = "py310"
line-length = 100
src = ["src", "tests"]
[tool.ruff.lint]
select = [
"F", # pyflakes
"E", # pycodestyle errors
"W", # pycodestyle warnings
"I", # isort
"UP", # pyupgrade
"B", # flake8-bugbear
"SIM", # flake8-simplify
"RUF", # ruff-specific
]
[tool.bumpversion]
current_version = "0.1.0"
commit = true
commit_args = ""
tag = true
tag_name = "xcl-lens-{new_version}"
message = "[xcl-lens] Bump version: {current_version} → {new_version}"
[[tool.bumpversion.files]]
filename = "src/xcl_lens/__init__.py"
search = "__version__ = \"{current_version}\""
replace = "__version__ = \"{new_version}\""
# Package version; the single source of truth read by hatch ([tool.hatch.version])
# and rewritten by bump-my-version ([tool.bumpversion]) — keep the exact format.
__version__ = "0.1.0"
from .parser import RcclLogParser
# Public API re-exported at the package root.
__all__ = ["RcclLogParser"]
#!/usr/bin/env python3
import argparse
import os
import subprocess
import sys
from .parser import RcclLogParser
def get_mpi_rank():
    """Return this process's rank as an ``int``.

    Checks common MPI/launcher environment variables in priority order and
    returns the first one found, converted to ``int``. Returns ``0`` when
    none is set (e.g. a plain, non-MPI run).
    """
    # Common MPI rank environment variables, most launcher-specific first
    rank_vars = [
        "OMPI_COMM_WORLD_RANK",  # OpenMPI
        "PMI_RANK",  # MPICH / MVAPICH
        "SLURM_PROCID",  # Slurm
        "RANK",  # General / Torch
    ]
    for var in rank_vars:
        if var in os.environ:
            return int(os.environ[var])
    return 0
def main():
    """Entry point for the xcl-lens CLI wrapper.

    Runs the given command with NCCL/RCCL debug logging forced on, streams
    and collects its combined stdout/stderr, and prints a parsed report on
    rank 0. Exits with the wrapped process's return code.
    """
    rank = get_mpi_rank()
    log_prefix = f"[Rank {rank}]"

    # Parse command line arguments. REMAINDER captures the wrapped command
    # and all of its own flags verbatim.
    arg_parser = argparse.ArgumentParser(description="RCCL Log Parser Wrapper")
    arg_parser.add_argument(
        "-v", "--verbose", action="store_true", help="Print raw log lines in addition to the report"
    )
    arg_parser.add_argument(
        "command", nargs=argparse.REMAINDER, help="The executable and arguments to run"
    )
    args = arg_parser.parse_args()
    verbose = args.verbose
    cmd = args.command

    # Every rank must exit when no command is given; previously non-zero
    # ranks fell through and crashed in Popen([]) / cmd[0]. Only rank 0
    # prints the help text to avoid duplicated output across ranks.
    if not cmd:
        if rank == 0:
            arg_parser.print_help()
        sys.exit(1)

    # Inject RCCL environment variables into a copy of the environment so
    # the library emits the logs the parser expects.
    env = os.environ.copy()
    env["NCCL_DEBUG"] = "INFO"
    env["NCCL_DEBUG_SUBSYS"] = "ALL"

    print(f"{log_prefix} [Wrapper] Running command: {' '.join(cmd)}")

    try:
        log_parser = RcclLogParser()
        process = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so NCCL logs are captured too
            text=True,
            bufsize=1,  # line-buffered: stream output as it arrives
        )

        # Collect (and optionally echo) all output lines
        for line in process.stdout:
            if verbose:
                print(f"{line}", end="", flush=True)
            log_parser.collect(line)

        process.wait()

        # Only rank 0 prints the aggregated report
        if rank == 0:
            log_parser.report()

        sys.exit(process.returncode)
    except KeyboardInterrupt:
        sys.exit(130)  # conventional exit code for SIGINT
    except FileNotFoundError:
        print(f"{log_prefix} Error: Command not found: {cmd[0]}")
        sys.exit(1)


if __name__ == "__main__":
    main()
# Re-export the concrete parser implementation at the subpackage level so
# callers can write `from xcl_lens.parser import RcclLogParser`.
from .rccl import RcclLogParser
__all__ = ["RcclLogParser"]
#!/usr/bin/env python3
import sys
import os
import subprocess
import re
import pandas as pd
......@@ -103,13 +99,11 @@ class RcclLogParser:
print()
def _report_graph_info(self):
"""Extract graph information (Optimized)"""
"""Extract graph information"""
print("===> Graph Info:\n")
# Filter lines by looking for 'Pattern' and 'crossNic'
filtered_lines = [
line for line in self.output if "Pattern" in line and "crossNic" in line
]
filtered_lines = [line for line in self.output if "Pattern" in line and "crossNic" in line]
if not filtered_lines:
print(" (No graph info found)\n")
......@@ -117,26 +111,21 @@ class RcclLogParser:
df = pd.DataFrame(filtered_lines, columns=["raw_log"])
# Extract all fields using a single regex
regex_parts = []
for key, col_name in self.graph_info_fields.items():
regex_parts.append(rf"{key}\s+(?P<{col_name}>[^,\s]+)")
# Join all parts with .*? to match any characters between fields
full_regex = r".*?".join(regex_parts)
# Extract each field independently (order-agnostic)
# Values are comma-separated, so use [^,\s]+ to exclude trailing commas
for pattern, col_name in self.graph_info_fields.items():
df[col_name] = df["raw_log"].str.extract(rf"\b{pattern}\s+([^,\s]+)", expand=False)
extracted_df = df["raw_log"].str.extract(full_regex)
# Clean up and convert to numeric
if "Pattern" in extracted_df.columns:
extracted_df["Pattern"] = pd.to_numeric(
extracted_df["Pattern"], errors="coerce"
)
# Type conversion for correct sorting
if "Pattern" in df.columns:
df["Pattern"] = pd.to_numeric(df["Pattern"], errors="coerce")
extracted_df.drop_duplicates(inplace=True)
extracted_df.sort_values(by="Pattern", ascending=False, inplace=True)
# Clean up
df.drop(columns=["raw_log"], inplace=True)
df.drop_duplicates(inplace=True)
df.sort_values(by="Pattern", ascending=False, inplace=True)
print(extracted_df.fillna("-").to_string(index=False))
print(df.fillna("-").to_string(index=False))
print()
def _report_cl_transfers(self):
......@@ -144,9 +133,7 @@ class RcclLogParser:
print("===> Unique Ring/Tree Transfers:\n")
# Filter lines by looking for 'protocol' and 'nbytes'
raw_lines = [
line for line in self.output if "protocol" in line and "nbytes" in line
]
raw_lines = [line for line in self.output if "protocol" in line and "nbytes" in line]
if not raw_lines:
print(" (No transfer patterns found)\n")
......@@ -156,9 +143,7 @@ class RcclLogParser:
# Extract all fields using a single loop
for pattern, col_name in self.cl_transfer_fields.items():
df[col_name] = df["raw_log"].str.extract(
rf"\b{pattern}\s+(\S+)", expand=False
)
df[col_name] = df["raw_log"].str.extract(rf"\b{pattern}\s+(\S+)", expand=False)
# Type conversion for correct sorting
for field in ["nbytes", "nchannels"]:
......@@ -188,9 +173,7 @@ class RcclLogParser:
print("===> Unique P2P Transfers:\n")
# Filter lines by looking for 'p2p :' and 'send rank'
raw_lines = [
line for line in self.output if "p2p :" in line and "send rank" in line
]
raw_lines = [line for line in self.output if "p2p :" in line and "send rank" in line]
if not raw_lines:
print(" (No P2P transfers found)\n")
......@@ -199,9 +182,7 @@ class RcclLogParser:
# Extract all fields using a single loop
df = pd.DataFrame(raw_lines, columns=["raw_log"])
for pattern, col_name in self.p2p_fields.items():
df[col_name] = df["raw_log"].str.extract(
rf"{pattern}\s+(\S+)", expand=False
)
df[col_name] = df["raw_log"].str.extract(rf"{pattern}\s+(\S+)", expand=False)
# Type conversion for correct sorting
numeric_cols = [
......@@ -235,75 +216,3 @@ class RcclLogParser:
# Fill NaNs with "-" and print
print(df.fillna("-").to_string(index=False))
print()
def get_mpi_rank():
    """
    Try to get the rank ID from common environment variables.

    Returns the rank as an ``int``; returns 0 if no rank variable is set.
    """
    # Common MPI Rank environment variables, checked in priority order
    rank_vars = [
        "OMPI_COMM_WORLD_RANK",  # OpenMPI
        "PMI_RANK",  # MPICH / MVAPICH
        "SLURM_PROCID",  # Slurm
        "RANK",  # General / Torch
    ]
    for var in rank_vars:
        if var in os.environ:
            return int(os.environ[var])
    return 0
def main():
    """Run the wrapped command with NCCL debug logging on and report on rank 0."""
    rank = get_mpi_rank()
    log_prefix = f"[Rank {rank}]"

    # Only print usage on rank 0 to avoid duplicated help across ranks.
    # NOTE(review): ranks != 0 fall through with an empty command and will
    # fail below in Popen / cmd[0] — confirm this is intended.
    if len(sys.argv) < 2 and rank == 0:
        script_name = os.path.basename(__file__)
        print(f"Usage: python {script_name} <executable> [arguments...]")
        sys.exit(1)

    # Get the command and a copy of the environment
    cmd = sys.argv[1:]
    env = os.environ.copy()

    # Inject RCCL environment variables so the library emits parseable logs
    env["NCCL_DEBUG"] = "INFO"
    env["NCCL_DEBUG_SUBSYS"] = "ALL"

    print(f"{log_prefix} [Wrapper] Running command: {' '.join(cmd)}")

    try:
        parser = RcclLogParser()
        process = subprocess.Popen(
            cmd,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into stdout for parsing
            text=True,
            bufsize=1,  # line-buffered streaming
        )

        # Collect all output lines, echoing each one as it arrives
        for line in process.stdout:
            print(f"{line}", end="", flush=True)
            parser.collect(line)

        process.wait()

        # Only rank 0 prints the aggregated report
        if rank == 0:
            parser.report()

        sys.exit(process.returncode)
    except KeyboardInterrupt:
        sys.exit(130)  # conventional exit code for SIGINT
    except FileNotFoundError:
        print(f"{log_prefix} Error: Command not found: {cmd[0]}")
        sys.exit(1)


if __name__ == "__main__":
    main()
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment