"magic_pdf/layout.bak/__init__.py" did not exist on "f99149b8ddc24251dce6de33cfc4ec09e18821c2"
data.py 1.87 KB
Newer Older
dengjb's avatar
dengjb committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
from pathlib import Path

from llama_index.core.node_parser import CodeSplitter
from llama_index.core.schema import BaseNode
from llama_index.readers.file import FlatReader

# Maps a lowercase file extension (without the leading dot) to the language
# name handed to the code splitter. Only files with these extensions are kept.
Languages = {
    'c': "c",
    'cpp': "cpp",
    'go': "go",
    'java': "java",
    'js': "javascript",
    'md': "markdown",
    'py': "python",
    'ts': "typescript",
}


def traverse(repo_path: str) -> list[str]:
    """
    Traverse the directory tree and collect every supported file
    - entries whose name starts with '.' (files and directories) are skipped
    - a file is kept only when its extension, compared case-insensitively,
      is a key of ``Languages``
    - a name without an extension (e.g. a file called ``go``) is never kept
    :param repo_path: path to this repo
    :return: sorted list of the paths of all supported files
    """
    file_paths: list[str] = []
    # Explicit stack instead of recursion: deep trees cannot hit the
    # recursion limit, and the final sort makes the visiting order irrelevant.
    pending = [repo_path]
    while pending:
        root = pending.pop()
        # The context manager closes the scandir iterator deterministically.
        with os.scandir(root) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue
                if entry.is_file():
                    # splitext() returns '' when the name has no dot, so an
                    # extensionless file is not mistaken for source code.
                    ext = os.path.splitext(entry.name)[1][1:].lower()
                    if ext in Languages:
                        file_paths.append(entry.path)
                elif entry.is_dir():
                    pending.append(entry.path)
    return sorted(file_paths)


def split_into_chunks(
    file_path,
    lines_per_chunk: int,
    lines_overlap: int,
    max_chars: int,
) -> list[BaseNode]:
    """
    Split file into chunks
    The language is looked up in ``Languages`` from the file extension
    (case-insensitive). Best-effort: a file with an unsupported or missing
    extension, or one that fails to load or split, yields an empty list.
    :param file_path: path to the file (string or path-like object)
    :param lines_per_chunk: lines for each chunk
    :param lines_overlap: overlap lines between 2 chunks
    :param max_chars: max characters for each chunk
    :return: the nodes produced by the splitter, or [] as described above
    """
    # splitext() returns '' when the final path component has no extension,
    # so an extensionless file named e.g. `go` is not parsed as Go source.
    ext = os.path.splitext(file_path)[1][1:].lower()
    lang = Languages.get(ext)
    if not lang:
        return []
    try:
        documents = FlatReader().load_data(Path(file_path))
        splitter = CodeSplitter(
            language=lang,
            chunk_lines=lines_per_chunk,
            chunk_lines_overlap=lines_overlap,
            max_chars=max_chars,
        )
        return splitter.get_nodes_from_documents(documents)
    except Exception as e:
        # Deliberately broad: one unreadable or unparsable file must not
        # abort processing of the remaining files; report it and move on.
        print(f'`{file_path}`切分失败: {e}')
        return []