mkdoc.py

#!/usr/bin/env python3
#
#  Syntax: mkdoc.py [-I<path> ..] [.. a list of header files ..]
#
#  Extract documentation from C++ header files to use it in Python bindings
#

import os
import sys
import platform
import re
import textwrap

from clang import cindex
from clang.cindex import CursorKind
from collections import OrderedDict
from threading import Thread, Semaphore
from multiprocessing import cpu_count

RECURSE_LIST = [
    CursorKind.TRANSLATION_UNIT,
    CursorKind.NAMESPACE,
    CursorKind.CLASS_DECL,
    CursorKind.STRUCT_DECL,
    CursorKind.CLASS_TEMPLATE
]

PRINT_LIST = [
    CursorKind.CLASS_DECL,
    CursorKind.STRUCT_DECL,
    CursorKind.ENUM_DECL,
    CursorKind.CLASS_TEMPLATE,
    CursorKind.FUNCTION_DECL,
    CursorKind.FUNCTION_TEMPLATE,
    CursorKind.CONVERSION_FUNCTION,
    CursorKind.CXX_METHOD,
    CursorKind.CONSTRUCTOR,
    CursorKind.FIELD_DECL
]

CPP_OPERATORS = {
    '<=': 'le', '>=': 'ge', '==': 'eq', '!=': 'ne', '[]': 'array',
    '+=': 'iadd', '-=': 'isub', '*=': 'imul', '/=': 'idiv', '%=':
    'imod', '&=': 'iand', '|=': 'ior', '^=': 'ixor', '<<=': 'ilshift',
    '>>=': 'irshift', '++': 'inc', '--': 'dec', '<<': 'lshift', '>>':
    'rshift', '&&': 'land', '||': 'lor', '!': 'lnot', '~': 'bnot',
    '&': 'band', '|': 'bor', '+': 'add', '-': 'sub', '*': 'mul', '/':
    'div', '%': 'mod', '<': 'lt', '>': 'gt', '=': 'assign'
}

CPP_OPERATORS = OrderedDict(
    sorted(CPP_OPERATORS.items(), key=lambda t: -len(t[0])))

job_count = cpu_count()
job_semaphore = Semaphore(job_count)

registered_names = dict()


def d(s):
    return s.decode('utf8')


def sanitize_name(name):
    global registered_names
    name = re.sub(r'type-parameter-0-([0-9]+)', r'T\1', name)
    for k, v in CPP_OPERATORS.items():
        name = name.replace('operator%s' % k, 'operator_%s' % v)
    name = re.sub('<.*>', '', name)
    name = ''.join([ch if ch.isalnum() else '_' for ch in name])
    name = re.sub('_$', '', re.sub('_+', '_', name))
    if name in registered_names:
        registered_names[name] += 1
        name += '_' + str(registered_names[name])
    else:
        registered_names[name] = 1
    return '__doc_' + name


def process_comment(comment):
    result = ''

    # Remove C++ comment syntax
    for s in comment.splitlines():
        s = s.strip()
        if s.startswith('/*'):
            s = s[2:].lstrip('* \t')
        elif s.endswith('*/'):
            s = s[:-2].rstrip('* \t')
        elif s.startswith('///'):
            s = s[3:]
        if s.startswith('*'):
            s = s[1:]
        result += s.strip() + '\n'

    # Doxygen tags
    cpp_group = '([\w:]+)'
    param_group = '([\[\w:\]]+)'

    s = result
    s = re.sub(r'\\c\s+%s' % cpp_group, r'``\1``', s)
    s = re.sub(r'\\a\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\e\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\em\s+%s' % cpp_group, r'*\1*', s)
    s = re.sub(r'\\b\s+%s' % cpp_group, r'**\1**', s)
    s = re.sub(r'\\ingroup\s+%s' % cpp_group, r'', s)
    s = re.sub(r'\\param%s?\s+%s' % (param_group, cpp_group),
               r'\n\n$Parameter ``\2``:\n\n', s)

    for in_, out_ in {
        'return': 'Returns',
        'author': 'Author',
        'authors': 'Authors',
        'copyright': 'Copyright',
        'date': 'Date',
        'remark': 'Remark',
        'sa': 'See also',
        'see': 'See also',
        'extends': 'Extends',
        'throw': 'Throws',
        'throws': 'Throws'
    }.items():
        s = re.sub(r'\\%s\s*' % in_, r'\n\n$%s:\n\n' % out_, s)

    s = re.sub(r'\\details\s*', r'\n\n', s)
    s = re.sub(r'\\brief\s*', r'', s)
    s = re.sub(r'\\short\s*', r'', s)
    s = re.sub(r'\\ref\s*', r'', s)

    # HTML/TeX tags
    s = re.sub(r'<tt>([^<]*)</tt>', r'``\1``', s)
    s = re.sub(r'<em>([^<]*)</em>', r'*\1*', s)
    s = re.sub(r'<b>([^<]*)</b>', r'**\1**', s)
    s = re.sub(r'\\f\$([^\$]*)\\f\$', r'$\1$', s)

    s = s.replace('``true``', '``True``')
    s = s.replace('``false``', '``False``')

    # Re-flow text
    wrapper = textwrap.TextWrapper()
    wrapper.expand_tabs = True
    wrapper.replace_whitespace = True
    wrapper.width = 75
    wrapper.initial_indent = wrapper.subsequent_indent = ''

    result = ''
    for x in re.split(r'\n{2,}', s):
        wrapped = wrapper.fill(x.strip())
        if len(wrapped) > 0 and wrapped[0] == '$':
            result += wrapped[1:] + '\n'
            wrapper.initial_indent = wrapper.subsequent_indent = ' ' * 4
        else:
            result += wrapped + '\n\n'
            wrapper.initial_indent = wrapper.subsequent_indent = ''
    return result.rstrip().lstrip('\n')


def extract(filename, node, prefix, output):
    num_extracted = 0
    if not (node.location.file is None or
            os.path.samefile(d(node.location.file.name), filename)):
        return 0
    if node.kind in RECURSE_LIST:
        sub_prefix = prefix
        if node.kind != CursorKind.TRANSLATION_UNIT:
            if len(sub_prefix) > 0:
                sub_prefix += '_'
            sub_prefix += d(node.spelling)
        for i in node.get_children():
            num_extracted += extract(filename, i, sub_prefix, output)
        if num_extracted == 0:
            return 0
    if node.kind in PRINT_LIST:
        comment = d(node.raw_comment) if node.raw_comment is not None else ''
        comment = process_comment(comment)
        sub_prefix = prefix
        if len(sub_prefix) > 0:
            sub_prefix += '_'
        name = sanitize_name(sub_prefix + d(node.spelling))
        output.append('\nstatic const char *%s =%sR"doc(%s)doc";' %
            (name, '\n' if '\n' in comment else ' ', comment))
        num_extracted += 1
    return num_extracted


class ExtractionThread(Thread):
    def __init__(self, filename, parameters, output):
        Thread.__init__(self)
        self.filename = filename
        self.parameters = parameters
        self.output = output
        job_semaphore.acquire()

    def run(self):
        print('Processing "%s" ..' % self.filename, file=sys.stderr)
        try:
            index = cindex.Index(
                cindex.conf.lib.clang_createIndex(False, True))
            tu = index.parse(self.filename, self.parameters)
            extract(self.filename, tu.cursor, '', self.output)
        finally:
            job_semaphore.release()

if __name__ == '__main__':
    parameters = ['-x', 'c++', '-std=c++11']
    filenames = []

    if platform.system() == 'Darwin':
        dev_path = '/Applications/Xcode.app/Contents/Developer/'
        lib_dir = dev_path + 'Toolchains/XcodeDefault.xctoolchain/usr/lib/'
        sdk_dir = dev_path + 'Platforms/MacOSX.platform/Developer/SDKs'
        libclang = lib_dir + 'libclang.dylib'

        if os.path.exists(libclang):
            cindex.Config.set_library_path(os.path.dirname(libclang))

        if os.path.exists(sdk_dir):
            sysroot_dir = os.path.join(sdk_dir, next(os.walk(sdk_dir))[1][0])
            parameters.append('-isysroot')
            parameters.append(sysroot_dir)

    for item in sys.argv[1:]:
        if item.startswith('-'):
            parameters.append(item)
        else:
            filenames.append(item)

    if len(filenames) == 0:
        print('Syntax: %s [.. a list of header files ..]' % sys.argv[0])
        exit(-1)

    print('''/*
  This file contains docstrings for the Python bindings.
  Do not edit! These were automatically extracted by mkdoc.py
 */

#define __EXPAND(x)                                      x
#define __COUNT(_1, _2, _3, _4, _5, _6, _7, COUNT, ...)  COUNT
#define __VA_SIZE(...)                                   __EXPAND(__COUNT(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1))
#define __CAT1(a, b)                                     a ## b
#define __CAT2(a, b)                                     __CAT1(a, b)
#define __DOC1(n1)                                       __doc_##n1
#define __DOC2(n1, n2)                                   __doc_##n1##_##n2
#define __DOC3(n1, n2, n3)                               __doc_##n1##_##n2##_##n3
#define __DOC4(n1, n2, n3, n4)                           __doc_##n1##_##n2##_##n3##_##n4
#define __DOC5(n1, n2, n3, n4, n5)                       __doc_##n1##_##n2##_##n3##_##n4##_##n5
#define __DOC6(n1, n2, n3, n4, n5, n6)                   __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6
#define __DOC7(n1, n2, n3, n4, n5, n6, n7)               __doc_##n1##_##n2##_##n3##_##n4##_##n5##_##n6##_##n7
#define DOC(...)                                         __EXPAND(__EXPAND(__CAT2(__DOC, __VA_SIZE(__VA_ARGS__)))(__VA_ARGS__))

#if defined(__GNUG__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
#endif
''')

    output = []
    for filename in filenames:
        thr = ExtractionThread(filename, parameters, output)
        thr.start()

    print('Waiting for jobs to finish ..', file=sys.stderr)
    for i in range(job_count):
        job_semaphore.acquire()

    output.sort()
    for l in output:
        print(l)

    print('''
#if defined(__GNUG__)
#pragma GCC diagnostic pop
#endif
''')