Unverified commit 4c29d6e7, authored by Nate Mailhot, committed by GitHub
Browse files

feat: check for broken symlinks. add back lychee external link checker with...


feat: check for broken symlinks. add back lychee external link checker with retries to fix failure (#4125)
Signed-off-by: Nate Mailhot <nmailhot@nvidia.com>
parent 9a353c5c
...@@ -3,13 +3,14 @@ ...@@ -3,13 +3,14 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
Script to detect broken links in markdown files within a git repository. Script to detect broken links in markdown files and problematic symbolic links within a git repository.
This script: This script:
1. Finds all .md files in the specified directory (recursively) 1. Finds all .md files in the specified directory (recursively)
2. Parses each file to extract links to other .md files 2. Parses each file to extract links to other .md files
3. Validates if the linked files exist 3. Validates if the linked files exist
4. Generates a JSON or HTML report of broken links with line numbers 4. Detects problematic symbolic links (broken, circular, outside repo)
5. Generates a JSON or HTML report of broken links and problematic symlinks with line numbers
""" """
import argparse import argparse
...@@ -556,9 +557,161 @@ def validate_links( ...@@ -556,9 +557,161 @@ def validate_links(
return broken_links_report return broken_links_report
def find_symbolic_links(root_dir: str, logger: logging.Logger) -> List[Path]:
    """
    Collect every symbolic link located under a directory tree.

    Args:
        root_dir: Directory whose contents are walked recursively
        logger: Logger that receives debug/info/warning output

    Returns:
        List of Path objects, one per symbolic link encountered. If the walk
        is interrupted by an OS-level error, the links gathered up to that
        point are still returned.
    """
    search_root = Path(root_dir).resolve()
    logger.debug(f"Searching for symbolic links in: {search_root}")

    found: List[Path] = []
    try:
        for entry in search_root.rglob("*"):
            if not entry.is_symlink():
                continue
            found.append(entry)
            logger.debug(f"Found symbolic link: {entry}")
    except OSError as e:
        # PermissionError is a subclass of OSError, so it is covered here too.
        logger.warning(f"Error accessing path during symlink search: {e}")

    logger.info(f"Found {len(found)} symbolic links in {root_dir}")
    return found
def detect_problematic_symlinks(
    symlinks: List[Path], git_root_dir: Optional[str], logger: logging.Logger
) -> Dict[str, List[Dict[str, str]]]:
    """
    Detect problematic symbolic links including broken, circular, and external links.

    Each symlink is classified into at most one of the critical categories
    ("broken", "circular", "external"). A symlink that passes those checks may
    additionally be reported one or more times under "suspicious".

    Args:
        symlinks: List of symbolic link paths to check
        git_root_dir: Git repository root directory for relative path calculation
            and for the "external" check; when falsy, paths are reported as given
            and the external check is skipped
        logger: Logger instance for logging

    Returns:
        Dictionary with the keys "broken", "circular", "external" and
        "suspicious". Each value is a list of dictionaries with the keys
        "symlink_path", "target_path", "absolute_symlink_path" and "issue".
    """
    # Local import: errno is only needed here to recognise symlink loops.
    import errno

    problematic_symlinks: Dict[str, List[Dict[str, str]]] = {
        "broken": [],
        "circular": [],
        "external": [],
        "suspicious": [],
    }

    git_root_path = Path(git_root_dir).resolve() if git_root_dir else None

    for symlink in symlinks:
        # Read the raw link target first; failure means we cannot inspect it at all.
        try:
            target_path = symlink.readlink()
        except OSError as e:
            problematic_symlinks["broken"].append(
                {
                    "symlink_path": str(symlink),
                    "target_path": "unknown",
                    "absolute_symlink_path": str(symlink),
                    "issue": f"Error accessing symlink: {e}",
                }
            )
            logger.error(f"Error processing symlink {symlink}: {e}")
            continue

        # Get relative path from git root for reporting
        relative_symlink = symlink
        if git_root_path:
            try:
                relative_symlink = symlink.relative_to(git_root_path)
            except ValueError:
                pass  # symlink lives outside the git root; report it as given

        symlink_info = {
            "symlink_path": str(relative_symlink),
            "target_path": str(target_path),
            "absolute_symlink_path": str(symlink),
            "issue": "",
        }

        # A single strict resolve distinguishes a missing target from a loop.
        # Python <= 3.12 signals a loop with RuntimeError, Python >= 3.13 with
        # OSError(ELOOP); both are handled so neither escapes this function.
        try:
            resolved_path = symlink.resolve(strict=True)
        except (FileNotFoundError, NotADirectoryError):
            symlink_info[
                "issue"
            ] = f"Broken symlink: target '{target_path}' does not exist"
            problematic_symlinks["broken"].append(symlink_info)
            logger.warning(f"Broken symlink found: {symlink} -> {target_path}")
            continue
        except RuntimeError:
            symlink_info[
                "issue"
            ] = "Circular symlink: too many levels of symbolic links"
            problematic_symlinks["circular"].append(symlink_info)
            logger.warning(f"Circular symlink found: {symlink}")
            continue
        except OSError as e:
            if e.errno == errno.ELOOP:
                symlink_info[
                    "issue"
                ] = "Circular symlink: too many levels of symbolic links"
                problematic_symlinks["circular"].append(symlink_info)
                logger.warning(f"Circular symlink found: {symlink}")
            else:
                symlink_info["issue"] = f"Error accessing symlink: {e}"
                problematic_symlinks["broken"].append(symlink_info)
                logger.error(f"Error processing symlink {symlink}: {e}")
            continue

        # Check if symlink points outside the repository
        if git_root_path and not resolved_path.is_relative_to(git_root_path):
            symlink_info[
                "issue"
            ] = f"External symlink: points outside repository to '{resolved_path}'"
            problematic_symlinks["external"].append(symlink_info)
            logger.warning(f"External symlink found: {symlink} -> {resolved_path}")
            continue

        # Suspicious patterns are non-exclusive. Each finding gets its own copy
        # of the info dict so one finding's message cannot overwrite another's.
        target_str = str(target_path)

        if len(target_str) > 200:
            problematic_symlinks["suspicious"].append(
                {
                    **symlink_info,
                    "issue": f"Suspicious symlink: unusually long target path ({len(target_str)} characters)",
                }
            )
            logger.info(f"Suspicious symlink found: {symlink} (long path)")

        # Many upward traversals hint at a target in a distant directory tree
        # (potential maintenance issue).
        if target_str.count("../") > 3:
            problematic_symlinks["suspicious"].append(
                {
                    **symlink_info,
                    "issue": f"Suspicious symlink: target requires many directory traversals ('{target_path}')",
                }
            )
            logger.info(f"Suspicious symlink found: {symlink} (many traversals)")

    # Log summary
    total_issues = sum(len(issues) for issues in problematic_symlinks.values())
    if total_issues > 0:
        logger.warning(f"Found {total_issues} problematic symbolic links:")
        for category, issues in problematic_symlinks.items():
            if issues:
                logger.warning(f"  {category}: {len(issues)}")
    else:
        logger.info("No problematic symbolic links found")

    return problematic_symlinks
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Detect broken links in markdown files", description="Detect broken links in markdown files and problematic symbolic links",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=""" epilog="""
Examples: Examples:
...@@ -615,6 +768,12 @@ Examples: ...@@ -615,6 +768,12 @@ Examples:
"--log-file", type=str, help="Log file path for detailed logging" "--log-file", type=str, help="Log file path for detailed logging"
) )
parser.add_argument(
"--check-symlinks",
action="store_true",
help="Also check for problematic symbolic links (broken, circular, external)",
)
args = parser.parse_args() args = parser.parse_args()
# Set up logging # Set up logging
...@@ -657,7 +816,31 @@ Examples: ...@@ -657,7 +816,31 @@ Examples:
logger.error(f"Error processing directory {directory}: {e}") logger.error(f"Error processing directory {directory}: {e}")
continue continue
# Check for problematic symbolic links if requested
all_problematic_symlinks = {}
if args.check_symlinks:
logger.info("Checking for problematic symbolic links...")
for directory in args.directories:
try:
symlinks = find_symbolic_links(directory, logger)
if symlinks:
problematic_symlinks = detect_problematic_symlinks(
symlinks, git_root_dir, logger
)
# Only include categories that have issues
for category, issues in problematic_symlinks.items():
if issues:
if category not in all_problematic_symlinks:
all_problematic_symlinks[category] = []
all_problematic_symlinks[category].extend(issues)
except Exception as e:
logger.error(f"Error checking symlinks in directory {directory}: {e}")
continue
# Prepare the final report # Prepare the final report
total_problematic_symlinks = sum(
len(issues) for issues in all_problematic_symlinks.values()
)
report = { report = {
"summary": { "summary": {
"total_files_processed": total_files_processed, "total_files_processed": total_files_processed,
...@@ -665,8 +848,11 @@ Examples: ...@@ -665,8 +848,11 @@ Examples:
"total_broken_links": sum( "total_broken_links": sum(
len(links) for links in all_broken_links.values() len(links) for links in all_broken_links.values()
), ),
"total_problematic_symlinks": total_problematic_symlinks,
"symlink_check_enabled": args.check_symlinks,
}, },
"broken_links": all_broken_links, "broken_links": all_broken_links,
"problematic_symlinks": all_problematic_symlinks,
"all_processed_files": sorted(all_processed_files), "all_processed_files": sorted(all_processed_files),
} }
...@@ -687,6 +873,7 @@ Examples: ...@@ -687,6 +873,7 @@ Examples:
cleaned_report = { cleaned_report = {
"summary": report["summary"], "summary": report["summary"],
"broken_links": cleaned_broken_links, "broken_links": cleaned_broken_links,
"problematic_symlinks": report["problematic_symlinks"],
"all_processed_files": report["all_processed_files"], "all_processed_files": report["all_processed_files"],
} }
output_content = json.dumps(cleaned_report, indent=2, ensure_ascii=False) output_content = json.dumps(cleaned_report, indent=2, ensure_ascii=False)
...@@ -707,14 +894,41 @@ Examples: ...@@ -707,14 +894,41 @@ Examples:
logger.info("Writing report to stdout") logger.info("Writing report to stdout")
print(output_content) print(output_content)
# Exit with error code if broken links were found # Exit with error code if broken links or problematic symlinks were found
if all_broken_links: # Note: "suspicious" symlinks are warnings only and don't cause failure
has_broken_links = bool(all_broken_links)
# Only count critical symlink issues (broken, circular, external) as errors
critical_symlink_categories = ["broken", "circular", "external"]
critical_symlinks = {
category: issues
for category, issues in all_problematic_symlinks.items()
if category in critical_symlink_categories
}
has_critical_symlinks = bool(critical_symlinks)
total_critical_symlinks = sum(len(issues) for issues in critical_symlinks.values())
# Log suspicious symlinks separately as warnings
suspicious_symlinks = all_problematic_symlinks.get("suspicious", [])
if suspicious_symlinks:
logger.warning( logger.warning(
f"Exiting with error code 1 due to {len(all_broken_links)} files with broken links" f"Found {len(suspicious_symlinks)} suspicious symlinks (warnings only, not causing failure)"
) )
if has_broken_links or has_critical_symlinks:
error_msg = []
if has_broken_links:
error_msg.append(f"{len(all_broken_links)} files with broken links")
if has_critical_symlinks:
error_msg.append(f"{total_critical_symlinks} critical problematic symlinks")
logger.warning(f"Exiting with error code 1 due to: {', '.join(error_msg)}")
sys.exit(1) sys.exit(1)
else: else:
logger.info("No broken links found - exiting successfully") success_msg = "No broken links found"
if args.check_symlinks:
success_msg += " and no critical problematic symlinks found"
logger.info(f"{success_msg} - exiting successfully")
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -11,7 +11,6 @@ permissions: ...@@ -11,7 +11,6 @@ permissions:
jobs: jobs:
lychee: lychee:
if: false # Job disabled until fixed
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Check out repository - name: Check out repository
...@@ -66,6 +65,9 @@ jobs: ...@@ -66,6 +65,9 @@ jobs:
lychee \ lychee \
--cache \ --cache \
--no-progress \ --no-progress \
--max-retries 2 \
--retry-wait-time 2 \
--timeout 20 \
--root-dir "${{ github.workspace }}" \ --root-dir "${{ github.workspace }}" \
--exclude-path ".*ATTRIBUTIONS.*" \ --exclude-path ".*ATTRIBUTIONS.*" \
--accept "200..=299, 403, 429" \ --accept "200..=299, 403, 429" \
...@@ -101,6 +103,7 @@ jobs: ...@@ -101,6 +103,7 @@ jobs:
python3 .github/workflows/detect_broken_links.py \ python3 .github/workflows/detect_broken_links.py \
--verbose \ --verbose \
--format json \ --format json \
--check-symlinks \
--output broken-links-report.json \ --output broken-links-report.json \
. .
exit_code=$? exit_code=$?
...@@ -137,7 +140,12 @@ jobs: ...@@ -137,7 +140,12 @@ jobs:
summary = report['summary'] summary = report['summary']
broken_links = report['broken_links'] broken_links = report['broken_links']
problematic_symlinks = report.get('problematic_symlinks', {})
has_broken_links = bool(broken_links)
has_problematic_symlinks = bool(problematic_symlinks)
if has_broken_links:
print('❌ BROKEN LINKS DETECTED') print('❌ BROKEN LINKS DETECTED')
print('=' * 50) print('=' * 50)
print(f'📊 Summary:') print(f'📊 Summary:')
...@@ -175,12 +183,58 @@ jobs: ...@@ -175,12 +183,58 @@ jobs:
print(f' 📍 Target: {link_url}') print(f' 📍 Target: {link_url}')
print() print()
if has_problematic_symlinks:
if has_broken_links:
print('\\n' + '=' * 50)
print('🔗 PROBLEMATIC SYMBOLIC LINKS DETECTED')
print('=' * 50)
print(f'📊 Summary:')
print(f' • Total problematic symlinks: {summary.get(\"total_problematic_symlinks\", 0)}')
print()
for category, symlinks in problematic_symlinks.items():
if symlinks:
category_icons = {
'broken': '💔',
'circular': '🔄',
'external': '🌐',
'suspicious': '⚠️'
}
icon = category_icons.get(category, '❓')
print(f'{icon} {category.upper()} SYMLINKS ({len(symlinks)}):')
print('-' * 40)
for i, symlink in enumerate(symlinks, 1):
symlink_path = symlink['symlink_path']
target_path = symlink['target_path']
issue = symlink['issue']
# Create GitHub annotation for each problematic symlink
annotation_msg = f'Problematic symlink: {issue}'
print(f'::error file={symlink_path}::{annotation_msg}')
# Display in workflow output
print(f' {i}. {symlink_path}')
print(f' → {target_path}')
print(f' ❌ {issue}')
print()
print('=' * 50) print('=' * 50)
print('✅ Next Steps:') print('✅ Next Steps:')
if has_broken_links:
print('1. Check the annotations above in the Files Changed tab (for PRs)') print('1. Check the annotations above in the Files Changed tab (for PRs)')
print('2. Click the GitHub links to jump directly to each broken link') print('2. Click the GitHub links to jump directly to each broken link')
print('3. Fix all broken links before merging') print('3. Fix all broken links before merging')
print('4. Re-run this workflow to verify fixes') if has_problematic_symlinks:
step_num = 4 if has_broken_links else 1
print(f'{step_num}. Review and fix problematic symbolic links')
print(f'{step_num + 1}. Consider replacing broken symlinks with actual files or fixing targets')
print(f'{step_num + 2}. Evaluate if suspicious symlinks with many traversals are necessary')
final_step = (7 if has_broken_links and has_problematic_symlinks
else 4 if has_broken_links
else 4 if has_problematic_symlinks
else 1)
print(f'{final_step}. Re-run this workflow to verify fixes')
except Exception as e: except Exception as e:
print(f'❌ Error reading broken links report: {e}') print(f'❌ Error reading broken links report: {e}')
......
../../docs/benchmarks/benchmarking.md#server-side-benchmarking-in-cluster ../../docs/benchmarks/benchmarking.md
\ No newline at end of file \ No newline at end of file
../../docs/benchmarks/pre_deployment_profiling.md ../../docs/benchmarks/sla_driven_profiling.md
\ No newline at end of file \ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment