# data.py

import os
from pathlib import Path

from llama_index.core.node_parser import CodeSplitter
from llama_index.core.schema import BaseNode
from llama_index.readers.file import FlatReader

# Supported file extensions (lower-case, without the leading dot) mapped to
# the language name that is passed as `language=` to `CodeSplitter`.
Languages = {
    "c": "c",
    "cpp": "cpp",
    "go": "go",
    "java": "java",
    "js": "javascript",
    "md": "markdown",
    "py": "python",
    "ts": "typescript",
}


def traverse(repo_path: str) -> list[str]:
    """
    Traverse the directory tree and collect every supported source file
    - skip every entry whose name starts with '.' (hidden files and hidden directories)
    - only keep files whose extension (case-insensitive) is a key of `Languages`
    - a file name without any '.' has no extension and is never kept
    :param repo_path: path to this repo
    :return: sorted list of the matching file paths
    :raises OSError: if a directory cannot be listed
    """
    file_paths = []
    # Explicit stack instead of recursion, so a deeply nested tree cannot hit
    # the interpreter's recursion limit.
    pending = [repo_path]
    while pending:
        # The context manager closes the scandir handle as soon as the
        # directory has been read instead of waiting for garbage collection.
        with os.scandir(pending.pop()) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue
                if entry.is_file():
                    # rpartition tells "no dot at all" (dot == '') apart from a
                    # real extension, so a file literally named `py` or `go`
                    # is not mistaken for a source file.
                    _, dot, ext = entry.name.rpartition('.')
                    if dot and ext.lower() in Languages:
                        file_paths.append(entry.path)
                elif entry.is_dir():
                    # is_dir() follows symlinks by default, so symlinked
                    # directories are descended into as well.
                    pending.append(entry.path)
    return sorted(file_paths)


def split_into_chunks(file_path, lines_per_chunk, lines_overlap, max_chars) -> list[BaseNode]:
    """
    Split file into chunks with `CodeSplitter`

    The language is looked up in `Languages` from the file extension
    (case-insensitive). An empty list is returned when the extension is not
    supported, and also when reading or splitting fails (the failure is
    printed, not raised).
    :param file_path: path to the file (`str` or path-like object)
    :param lines_per_chunk: lines for each chunk
    :param lines_overlap: overlap lines between 2 chunks
    :param max_chars: max characters for each chunk
    :return: the nodes produced by the splitter, or `[]`
    """
    path = Path(file_path)
    # Look at the final path component only: a '.' in a parent directory name
    # or an extension-less file name (e.g. a file called `py`) must not be
    # read as an extension.
    _, dot, ext = path.name.rpartition('.')
    lang = Languages.get(ext.lower()) if dot else None
    if not lang:
        return []
    try:
        documents = FlatReader().load_data(path)
        splitter = CodeSplitter(
            language=lang,
            chunk_lines=lines_per_chunk,
            chunk_lines_overlap=lines_overlap,
            max_chars=max_chars,
        )
        return splitter.get_nodes_from_documents(documents)
    except Exception as e:
        # Deliberate best-effort: one unreadable or unparsable file must not
        # abort processing of the remaining files.
        print(f'`{file_path}`切分失败: {e}')
        return []