# chineselink.py

"""
This script keeps the Chinese docs in sync with the English docs. It should be run regularly.
There is no sane way to check the contents, though. PR review should require contributors to update the corresponding translation.
See https://github.com/microsoft/nni/issues/4298 for discussion.

Under docs, run

    python tools/chineselink.py
"""

import hashlib
import shutil
import sys
from pathlib import Path


def iterate_dir(path):
    """Recursively yield every non-directory entry found beneath ``path``.

    Sub-directories are descended into as soon as they are encountered and
    are never yielded themselves.
    """
    for entry in Path(path).iterdir():
        if not entry.is_dir():
            yield entry
        else:
            # Descend into the sub-directory before moving on to the next sibling.
            yield from iterate_dir(entry)

# Extensions of the documentation files that take part in translation tracking.
suffix_list = ['.html', '.md', '.rst', '.ipynb']

# "check" as the first command-line argument selects pipeline mode, in which
# problems are only recorded and reported instead of being fixed on disk.
pipeline_mode = sys.argv[1:2] == ['check']

# Human-readable descriptions of every file found missing, stale or redundant.
failed_files = []

# in case I need to change `_zh` to something else
# files = list(filter(lambda d: d.name.endswith('zh_CN.rst'), iterate_dir('source')))
# for file in files:
#     os.rename(file, file.parent / (file.name[:-7] + file.name[-4:]))


def need_to_translate(source, target):
    """Make sure ``target`` records the current checksum of ``source``.

    A translation is expected to start with a ``.. <checksum>`` line, where the
    checksum is the first 32 hex digits of the SHA-256 digest of the source
    file. Any problem found is appended to the module-level ``failed_files``.

    * A missing target is reported; outside pipeline mode it is then created
      as a copy of ``source``.
    * ``.html`` targets are not processed any further.
    * A target whose recorded checksum differs from the source is reported;
      outside pipeline mode its checksum line is rewritten (or inserted,
      followed by a blank line, when the file has no such line).

    In pipeline mode nothing on disk is modified.

    Args:
        source: ``pathlib.Path`` of the original (English) document.
        target: ``pathlib.Path`` of the corresponding translation.
    """
    if not target.exists():
        failed_files.append('(missing) ' + target.as_posix())
        if pipeline_mode:
            return
        # Seed the translation with the untranslated source text.
        shutil.copyfile(source, target)
    if target.suffix == '.html':
        return  # FIXME I don't know how to process html

    # Hash the file that was passed in rather than relying on a module-level
    # variable that merely happens to refer to the same file.
    target_checksum = hashlib.sha256(source.read_bytes()).hexdigest()[:32]

    # Docs contain non-ASCII text, so do not depend on the locale's encoding.
    with target.open('r', encoding='utf-8') as f:
        contents = f.readlines()

    # An empty translation has no first line; treat it as having no checksum.
    first = contents[0] if contents else ''
    checksum = first.strip()[3:]
    if checksum != target_checksum:
        failed_files.append('(out-of-date) ' + target.as_posix())
        if pipeline_mode:
            return

    firstline = '.. ' + target_checksum + '\n'
    if first.startswith('.. '):
        # Replace the existing checksum line.
        contents = [firstline] + contents[1:]
    else:
        # No checksum line yet: insert one, separated by a blank line.
        contents = [firstline, '\n'] + contents
    with target.open('w', encoding='utf-8') as f:
        f.writelines(contents)


# Walk every documentation file under ``source`` (skipping build output) and
# reconcile each English file with its ``_zh`` translation.
for path in iterate_dir(Path('source')):
    relative_path = path.relative_to('source')
    if relative_path.as_posix().startswith('_build'):
        continue
    if path.suffix not in suffix_list:
        continue
    # Decide on the stem so the test agrees with the slicing used below; a
    # name that merely contains "_zh." somewhere is not a translation.
    if not path.stem.endswith('_zh'):
        target_path = path.parent / (path.stem + '_zh' + path.suffix)
        if target_path.exists():
            # whitelist files. should be translated
            need_to_translate(path, target_path)
            print(f'Skipped linking for {path} as it is in whitelist.')
    else:
        source_path = path.parent / (path.stem[:-len('_zh')] + path.suffix)
        if not source_path.exists():
            # The translation has no source any more: report (and, outside
            # pipeline mode, delete) the translation file itself.
            failed_files.append('(redundant) ' + path.as_posix())
            if not pipeline_mode:
                print(f'Deleting {path}')
                path.unlink()


# Final report: in pipeline mode any recorded problem fails the run; otherwise
# the files that were touched are simply listed.
if failed_files:
    if pipeline_mode:
        listing = '\n'.join(f'  {entry}' for entry in failed_files)
        raise ValueError(
            'The following files are not up-to-date. Please run "python3 tools/chineselink.py" under docs folder '
            'to refresh them and update their corresponding translation.\n' + listing)
    print('Updated files:', failed_files)