tools.py 5.71 KB
Newer Older
dengjb's avatar
dengjb committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import json
import os
import zipfile
import git
import urllib.parse
import re

def is_valid_json(json_string):
    """
    Report whether a string is, or contains, parseable JSON.

    If the text contains braces, the span from the first ``{`` to the last
    ``}`` (newlines included) is extracted and parsed on its own, so any text
    before or after that span is ignored. Without such a span the whole
    string is parsed as-is.

    Args:
        json_string (str): Text to check.

    Returns:
        bool: True when the selected text parses as JSON, False otherwise.
    """
    try:
        # Greedy match: first opening brace through the last closing brace.
        embedded = re.search(r'\{.*\}', json_string, re.DOTALL)
        candidate = embedded.group() if embedded else json_string
        json.loads(candidate)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return False
    return True

def clone_repo(repo_url, clone_to):
    """
    Clone a Git repository into a sub-directory of ``clone_to``.

    The sub-directory is named after the last non-empty path segment of
    ``repo_url`` (a ``.git`` suffix, if present, is kept as part of the name).
    If that sub-directory already exists it is returned as-is and nothing is
    cloned.

    Args:
        repo_url (str): URL of the repository to clone.
        clone_to (str): Local directory under which the clone is placed;
            it is created when missing.

    Returns:
        str | None: Path of the cloned repository (``clone_to`` plus the
        repository sub-directory) on success, ``None`` on failure.
    """
    try:
        os.makedirs(clone_to, exist_ok=True)

        # Derive the repository name from the URL. Trailing slashes are
        # stripped first; otherwise the name would be empty and the target
        # path would collapse onto ``clone_to`` itself.
        url_path = urllib.parse.urlparse(repo_url).path
        repo_name = url_path.rstrip('/').split('/')[-1]
        if not repo_name:
            print(
                "Failed to clone repository: "
                f"cannot determine repository name from {repo_url}"
            )
            return None

        # The clone lives in its own directory below clone_to.
        cloned_path = os.path.join(clone_to, repo_name)
        if os.path.exists(cloned_path):
            # Existing directory is returned without cloning again.
            return cloned_path

        git.Repo.clone_from(repo_url, cloned_path)

        print(f"Repository cloned to {cloned_path}")
        return cloned_path
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Failed to clone repository: {e}")
        return None
def unzip_file(zip_path, extract_dir):
    """
    Extract a zip archive into its own sub-directory of ``extract_dir``.

    The sub-directory is named after the archive file without its extension
    (``foo/bar.zip`` is extracted to ``<extract_dir>/bar``). Missing
    directories are created.

    Args:
        zip_path (str): Path of the zip archive.
        extract_dir (str): Directory under which the archive is unpacked.

    Returns:
        str: Path of the directory that now holds the extracted files.
    """
    archive_stem, _ = os.path.splitext(os.path.basename(zip_path))
    new_extract_dir = os.path.join(extract_dir, archive_stem)

    # Create the parent first, then the archive-specific directory.
    for directory in (extract_dir, new_extract_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(new_extract_dir)

    return new_extract_dir


def get_project_files_with_content(project_dir):
    """
    Collect the relative path and text content of the project's files.

    The directory tree is walked recursively. A file is included only when
    ``filter_data`` accepts its name and its relative path does not contain
    ``__MACOSX``. Files are read as UTF-8 and undecodable bytes are dropped.

    Args:
        project_dir (str): Root directory of the project.

    Returns:
        list: One dict per included file, with the keys ``"path"`` (path
        relative to ``project_dir``) and ``"content"`` (file text).
    """
    collected = []

    for current_dir, _subdirs, filenames in os.walk(project_dir):
        for filename in filenames:
            if not filter_data(filename):
                continue

            absolute_path = os.path.join(current_dir, filename)
            relative_path = os.path.relpath(absolute_path, project_dir)
            if "__MACOSX" in relative_path:
                continue

            with open(absolute_path, "r", encoding="utf-8", errors="ignore") as handle:
                text = handle.read()
            collected.append({"path": relative_path, "content": text})

    return collected


def filter_data(obj):
    """
    Decide whether a file should be kept, based on its name.

    A file is kept when either
      * its extension (the text after the last ``.``, with the dot put back
        in front) is listed for one of the languages named in
        ``LANGUAGE_TAG`` in
        ``utils/programming-languages-to-file-extensions.json``, or
      * its exact name appears on a line of ``utils/keep.txt``.

    Both files are opened relative to the current working directory and are
    re-read on every call.

    Args:
        obj (str): File name to check. A name without a dot is treated as
            its own extension (``"Makefile"`` -> ``".Makefile"``).

    Returns:
        bool: True when the file should be kept, False otherwise.
    """
    # Only the keys are consulted here (lower-cased language names).
    LANGUAGE_TAG = {
        "c++": "// C++",
        "cpp": "// C++",
        "c": "// C",
        "c#": "// C#",
        "c-sharp": "// C#",
        "css": "/* CSS */",
        "cuda": "// Cuda",
        "fortran": "! Fortran",
        "go": "// Go",
        "html": "<!-- HTML -->",
        "java": "// Java",
        "js": "// JavaScript",
        "javascript": "// JavaScript",
        "kotlin": "// Kotlin",
        "lean": "-- Lean",
        "lua": "-- Lua",
        "objectivec": "// Objective-C",
        "objective-c": "// Objective-C",
        "objective-c++": "// Objective-C++",
        "pascal": "// Pascal",
        "php": "// PHP",
        "python": "# Python",
        "r": "# R",
        "rust": "// Rust",
        "ruby": "# Ruby",
        "scala": "// Scala",
        "shell": "# Shell",
        "sql": "-- SQL",
        "tex": "% TeX",
        "typescript": "// TypeScript",
        "vue": "<!-- Vue -->",
        "assembly": "; Assembly",
        "dart": "// Dart",
        "perl": "# Perl",
        "prolog": "% Prolog",
        "swift": "// swift",
        "lisp": "; Lisp",
        "vb": "' Visual Basic",
        "visual basic": "' Visual Basic",
        "matlab": "% Matlab",
        "delphi": "{ Delphi }",
        "scheme": "; Scheme",
        "basic": "' Basic",
        "groovy": "// Groovy",
        "abap": "* Abap",
        "gdscript": "# GDScript",
        "haskell": "-- Haskell",
        "julia": "# Julia",
        "elixir": "# Elixir",
        "excel": "' Excel",
        "clojure": "; Clojure",
        "actionscript": "// ActionScript",
        "solidity": "// Solidity",
        "powershell": "# PowerShell",
        "erlang": "% Erlang",
        "cobol": "// Cobol",
        "batchfile": ":: Batch file",
        "makefile": "# Makefile",
        "dockerfile": "# Dockerfile",
        "markdown": "<!-- Markdown -->",
        "cmake": "# CMake",
    }

    # Context manager so the mapping file is closed deterministically.
    with open(
        "utils/programming-languages-to-file-extensions.json", encoding="utf-8"
    ) as mapping_file:
        extensions_by_language = json.load(mapping_file)

    # Extensions of every language named in LANGUAGE_TAG (case-insensitive).
    wanted_extensions = {
        extension
        for language, extensions in extensions_by_language.items()
        if language.lower() in LANGUAGE_TAG
        for extension in extensions
    }

    # Exact file names that are kept regardless of their extension.
    with open("utils/keep.txt", "r") as keep_file:
        keep_files = {line.strip() for line in keep_file}

    ext = "." + obj.split(".")[-1]
    return ext in wanted_extensions or obj in keep_files