update codes

1768a324 · dengjb · 18493eef · 1768a324 · 1768a324 · 1768a324
Commit 1768a324 authored Jul 16, 2024 by dengjb
20 changed files
--- a/repodemo/prompts/base_prompt.py
+++ b/repodemo/prompts/base_prompt.py
+# System prompt for general chat: presents the assistant as CodeGeeX, an
+# intelligent programming assistant (the prompt text itself is written in Chinese).
+base_system_prompt = """<|system|>\n你是一位智能编程助手，你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题，并提供格式规范、可以执行、准确安全的代码，并在必要时提供详细的解释。"""
+# System prompt for repository-level tasks: same persona, plus an instruction to
+# generate or modify code and to answer in the `###PATH:{PATH}\n{CODE}` format.
+repo_system_prompt = """<|system|>\n你是一位智能编程助手，你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题，并提供格式规范、可以执行、准确安全的代码。请根据用户给出的项目仓库中的代码，以及用户提出的需求，生成新的代码或者更改已有代码。输出格式：\n\n###PATH:{PATH}\n{CODE}"""
+# Instruction block (in Chinese) for answering a question from numbered
+# `[[citation:x]]` context snippets; `.lstrip()` removes leading whitespace.
+# NOTE(review): the name looks like a typo of `web_search_prompt`, and it is not
+# referenced by the other files visible in this commit - confirm before renaming.
+web_search_prompy = """你将接收到一个用户提出的问题，并请撰写清晰、简洁且准确的答案。
+# Note
+- 您将获得与问题相关的多个上下文片段，每个上下文都以引用编号开头，例如[[citation:x]]，其中x是一个数字。如果适用，请使用上下文并在每个句子的末尾引用上下文。
+- 您的答案必须是正确的、准确的，并且以专家的身份使用无偏见和专业的语调来撰写。
+- 请你的回答限制在2千字以内，不要提供与问题无关的信息，也不要重复。
+- 请以引用编号的格式[[citation:x]]来引用上下文。如果一个句子来自多个上下文，请列出所有适用的引用，例如[[citation:3]][[citation:5]]。
+- 若所有上下文均不相关，请以自己的理解回答用户提出的问题，此时回答中可以不带引用编号。
+- 除了代码和特定的名称和引用外，您的答案必须使用与问题相同的语言来撰写。
+""".lstrip()
+# Few-shot prompt asking the model to choose tools for a user request and to
+# answer with a JSON object of the form {"thoughts": {...}, "tool": {"name": [...]}}.
+# It ends with an open "Input:" section; `tools_input_prompt` is appended to it to
+# supply the actual user text. The "Response Format" sample is kept strictly valid
+# JSON because the reply is parsed with json.loads.
+tools_choose_prompt = """<|user|>\nAs a tool selector, you'll provide users with suggestions on tool selection. Depending on the provided tool summary (tools_summary) and user input (input_text), you'll need to follow these steps:
+1. Read and understand the tool summary (tools_summary):
+   - Understand the features, use cases, and limitations of each tool.
+2. Analyze User Input (input_text):
+   - Understand the user's needs or problems.
+   - Identify keywords or phrases to determine which tool best suits the user's needs.
+3. Decision-making logic:
+   - Recommend a tool if the user's needs correspond to the tool's functionality.
+   - If the user's needs are not suitable for any tool, or if the information is not sufficient to make a judgment, no tool is recommended.
+4. Output:
+   - If a tool is recommended, output the tool name (toolname).
+   - If no tool is recommended, the output is empty.
+Note that recommendations for tool selection should be based on the user's needs and refer to the tool summary provided. Follow the steps above and make sure to provide accurate tool selection suggestions in the output.
+Here are some examples of tool selection:
+Input:
+tools_summary: {
+    "online_query": "Questions need to be queried on the Internet to ensure accurate answers",
+    "project_qa": "Questions need to be answered specific to the project",
+    "project_modify": "The problem is that we need to modify the project"
+}
+input_text: "今天星期几"
+Output:
+{
+    "thoughts": {
+        "text": "用户想知道今天是星期几。",
+        "reasoning": "根据工具概要，'online_query' 是用来在互联网上查询问题以确保准确答案，这与用户的需求相符。",
+        "criticism": "没有其他工具适合回答这类问题，因为这是一个需要实时信息的查询。",
+        "speak": "让我在网上查一下今天是星期几。"
+    },
+    "tool": {
+        "name": ["online_query"]
+    }
+}
+Input:
+tools_summary: {
+    "online_query": "Questions need to be queried on the Internet to ensure accurate answers",
+    "project_qa": "Questions need to be answered specific to the project",
+    "project_modify": "The problem is that we need to modify the project"
+}
+input_text: "请你帮我把项目的readme改成韩文"
+Output:
+{
+    "thoughts": {
+        "text": "用户需要将项目的readme文件翻译成韩文。",
+        "reasoning": "根据工具概要，project_modify专用于项目修改，这与用户的需求相符。",
+        "criticism": "需要确保用户对翻译后的韩文内容满意，因为翻译质量可能影响项目的整体感受。",
+        "speak": "我们将使用project_modify来修改项目的readme文件。请确认您希望使用的韩文翻译内容。"
+    },
+    "tool": {
+        "name": ["project_modify"]
+    }
+}
+Input:
+tools_summary: {
+    "online_query": "Questions need to be queried on the Internet to ensure accurate answers",
+    "project_qa": "Questions need to be answered specific to the project",
+    "project_modify": "The problem is that we need to modify the project"
+}
+input_text: "你是谁"
+Output:
+{
+    "thoughts": {
+        "text": "用户问“你是谁”。",
+        "reasoning": "用户的提问是一个通用问题，不涉及具体的工具功能需求。",
+        "criticism": "这个问题不需要使用任何工具来回答，只需直接回答用户的问题即可。",
+        "speak": "我是一个人工智能助手，随时为您提供帮助。"
+    },
+    "tool": {
+        "name": []
+    }
+}
+Input:
+tools_summary: {
+    "online_query": "Questions need to be queried on the Internet to ensure accurate answers",
+    "project_qa": "Questions need to be answered specific to the project",
+    "project_modify": "The problem is that we need to modify the project"
+}
+input_text: "解释一下项目"
+Output:
+{
+    "thoughts": {
+        "text": "用户需要对项目进行解释。",
+        "reasoning": "用户的需求是需要对项目进行解释，这通常涉及到具体项目的细节和背景。",
+        "criticism": "目前的工具概要中，只有project_qa适用于与项目相关的问题解答。",
+        "speak": "您能提供更多关于项目的信息吗？这将有助于提供更准确的解释。"
+    },
+    "tool": {
+        "name": ["project_qa"]
+    }
+}
+You should only respond in JSON format as described below
+Response Format:
+{
+    "thoughts": {
+        "text": "your thoughts in the current context",
+        "reasoning": "reasoning for tool selection and input content",
+        "criticism": "critical thinking on tool selection and input in current context",
+        "speak": "words you want to speak to the user"
+    },
+    "tool": {
+        "name": ["tool_name"]
+    }
+}
+The strings corresponding to "text", "reasoning", "criticism", and "speak" in JSON should be described in Chinese.
+If you don't need to use a tool(like solely chat scene), or have already reasoned the final answer associated with user input from the tool, You must abide by the following rules:
+1. The tool's name in json is [].
+Do not output any other information and do not contain quotation marks, such as `, \", \' and so on.
+Ensure the output can be parsed by Python json.loads.
+Don't output in markdown format, something like ```json or ```,just output in the corresponding string format.
+Input:
+tools_summary: {
+    "online_query": "Questions need to be queried on the Internet to ensure accurate answers",
+    "project_qa": "Questions need to be answered specific to the project",
+    "project_modify": "The problem is that we need to modify the project"
+}
+"""
+# Tail of the tool-selection prompt: `{input_text}` is filled in with the user's
+# message via str.format, and the trailing `<|assistant|>` tag opens the model's turn.
+tools_input_prompt = """
+input_text: "{input_text}"
+Output:
+<|assistant|>\n"""
+def build_message_list(result):
+    """Split a tagged prompt string into a list of chat messages.
+
+    Args:
+        result: Prompt text in which each turn is introduced by one of the
+            role tags `<|system|>`, `<|user|>` or `<|assistant|>`.
+
+    Returns:
+        A list of `{"role": ..., "content": ...}` dicts, one per role tag, in
+        order of appearance. The content is everything between a tag and the
+        next role tag (including the newline that follows the tag). Text that
+        precedes the first role tag is ignored.
+    """
+    import re
+
+    # Split only on the three real role tags. Splitting on a bare "<|" would
+    # cut a message short whenever its content contains "<|" (for example
+    # source code or a placeholder such as <|apikey|>).
+    parts = re.split(r"<\|(system|user|assistant)\|>", result)
+    # parts = [text before first tag, role, content, role, content, ...]
+    return [
+        {"role": role, "content": content}
+        for role, content in zip(parts[1::2], parts[2::2])
+    ]
+def get_cur_base_user_prompt(message_history, index_prompt=None, judge_context=""):
+    """Build the message list sent to the model for the current conversation.
+
+    Args:
+        message_history: Sequence of `{"role", "content"}` dicts. "user" and
+            "tool" entries are both rendered as user turns, "assistant" entries
+            as assistant turns; any other role is skipped.
+        index_prompt: Optional text placed in front of the content of the very
+            first history entry, when that entry is a user/tool turn.
+        judge_context: When it contains "project_modify" the repository system
+            prompt is used, otherwise the base system prompt.
+
+    Returns:
+        The result of `build_message_list` applied to the system prompt, the
+        rendered history and a trailing empty assistant turn.
+    """
+    turns = []
+    for position, entry in enumerate(message_history):
+        role = entry["role"]
+        if role in ("user", "tool"):
+            if position == 0 and index_prompt is not None:
+                # Project context goes in front of the first user turn only.
+                turns.append("<|user|>\n" + index_prompt + entry["content"])
+            else:
+                turns.append(f"<|user|>\n{entry['content']}")
+        elif role == "assistant":
+            turns.append(f"<|assistant|>\n{entry['content']}")
+    if "project_modify" in judge_context:
+        system_prompt = repo_system_prompt
+    else:
+        system_prompt = base_system_prompt
+    return build_message_list(system_prompt + "".join(turns) + "<|assistant|>\n")
--- a/repodemo/public/avatars/user1.png
+++ b/repodemo/public/avatars/user1.png
--- a/repodemo/public/favicon.png
+++ b/repodemo/public/favicon.png
--- a/repodemo/public/logo_dark.png
+++ b/repodemo/public/logo_dark.png
--- a/repodemo/public/logo_light.png
+++ b/repodemo/public/logo_light.png
--- a/repodemo/readme.md
+++ b/repodemo/readme.md
+# CodeGeeX
+# Welcome to My Chat Demo Application
+This is a simple demonstration application.
+## Instructions
+1. Enter your question.
+2. Wait for a response.
+3. Enjoy the conversation!
+## Features
+- Supports multi-turn conversations.
+- Supports online Q&A.
+- Supports uploading local zip packages for project Q&A and modifications.
+- Supports inputting GitHub project links for project Q&A and modifications.
+## Installation
+1. Clone the repository locally.
+2. Start the model. You can deploy the model yourself with vllm or ollama, expose it through an OpenAI-compatible API, and set the deployment's `api_base` and `api_key`. Alternatively, visit [CodeGeeX API](https://open.bigmodel.cn/dev/api#codegeex-4) to get an API key for the hosted model.
+```shell
+#use open.bigmodel.cn api
+openai_api_key = "<|apikey|>"
+openai_api_base = "https://open.bigmodel.cn/api/paas/v4/"
+model_name = "codegeex-4"
+#use vllm
+openai_api_key = "EMPTY"
+openai_api_base = "http://xxxx:xxxx/v1"
+model_name = "codegeex4-all-9b"
+```
+3. Fill in the corresponding model information and `bing_api_key` (only needed if you want to try online search) in the `.env` file.
+4. Install dependencies: `pip install -r requirements.txt`.
+5. Run the application: `chainlit run run.py --port 8899`.
+## Note
+Please ensure your network environment can access the CodeGeeX API.
+## Disclaimer
+This application is for educational and research purposes only and should not be used for any commercial purposes. The developer is not responsible for any loss or damage caused by the use of this application.
+## Acknowledgements
+Thank you for using our application. If you have any questions or suggestions, please feel free to contact us. We look forward to your feedback and are committed to providing you with better service.
\ No newline at end of file
--- a/repodemo/requirements.txt
+++ b/repodemo/requirements.txt
+chainlit==1.1.305
+beautifulsoup4
+python-dotenv
+gitpython
\ No newline at end of file
--- a/repodemo/run.py
+++ b/repodemo/run.py
+import chainlit as cl
+from chainlit.input_widget import Slider,Switch
+import json
+import re
+from llm.api.codegeex4 import codegeex4
+from prompts.base_prompt import (
+    get_cur_base_user_prompt,
+    build_message_list,
+    tools_choose_prompt,
+    tools_input_prompt
+)
+from utils.bingsearch import bing_search_prompt
+from utils.tools import unzip_file, get_project_files_with_content,clone_repo,is_valid_json
+def _parse_tool_names(raw_text):
+    """Extract the `tool.name` list from a model reply, or None if it cannot be parsed."""
+    # The reply may wrap the JSON object in extra text, so take the outermost {...} span.
+    match = re.search(r'\{.*\}', raw_text, re.DOTALL)
+    candidate = match.group() if match else raw_text
+    try:
+        names = json.loads(candidate)["tool"]["name"]
+    except (ValueError, KeyError, TypeError):
+        return None
+    if isinstance(names, str):
+        # Tolerate a bare string where a list was requested.
+        return [names] if names else []
+    return names if isinstance(names, list) else None
+def tools_choose_agent(input_text):
+    """Ask the model which tools fit the user's message.
+
+    The tool-selection prompt is sent to the model and its reply is parsed as
+    JSON. The request is repeated (one initial call plus up to `max_attempts`
+    retries) until the reply contains a usable `tool.name` entry.
+
+    Args:
+        input_text: The user's message.
+
+    Returns:
+        The list of tool names chosen by the model. An empty list means "no
+        tool"; it is also returned when no attempt produced a parsable reply,
+        so that a malformed model answer does not abort message handling.
+    """
+    tools_prompt = tools_choose_prompt + tools_input_prompt.format(input_text=input_text)
+    message_list = build_message_list(tools_prompt)
+    max_attempts = 10
+    for _ in range(max_attempts + 1):
+        judge_tmp = codegeex4(
+            messages_list=message_list,
+            temperature=0.2,
+            top_p=0.95,
+        )
+        # codegeex4 is consumed as an iterable of text chunks.
+        judge_context = "".join(judge_tmp)
+        tool_name = _parse_tool_names(judge_context)
+        if tool_name is not None:
+            return tool_name
+    return []
+@cl.set_chat_profiles
+async def chat_profile():
+    """Return the two chat profiles: online chat (with starters) and project Q&A."""
+    # Each starter uses the same text for its label and for the message it sends.
+    starter_texts = (
+        "请你用python写一个快速排序。",
+        "请你介绍一下自己。",
+        "用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
+        "我是一个python初学者，请你告诉我怎么才能学好python。",
+    )
+    online_chat = cl.ChatProfile(
+        name="联网聊天",
+        markdown_description="聊天demo：支持多轮对话。支持联网回答用户问题。默认联网，如不联网在输入框左边关闭联网功能。",
+        starters=[cl.Starter(label=text, message=text) for text in starter_texts],
+    )
+    project_qa = cl.ChatProfile(
+        name="项目问答",
+        markdown_description="项目级能力demo：支持上传本地zip压缩包项目，支持输入GitHub链接项目，可以进行项目问答和对项目进行修改。",
+    )
+    return [online_chat, project_qa]
+@cl.on_settings_update
+async def setup_agent(settings):
+    """Copy the updated temperature, top_p and is_online settings into the user session."""
+    keys = ("temperature", "top_p", "is_online")
+    # Read every value first so a missing key fails before anything is stored.
+    values = [settings[key] for key in keys]
+    for key, value in zip(keys, values):
+        cl.user_session.set(key, value)
+async def _index_project(project_path):
+    """Load the project's files into the session and confirm to the user if any were found."""
+    files_list = get_project_files_with_content(project_path)
+    cl.user_session.set("project_index", files_list)
+    if len(files_list) > 0:
+        await cl.Message(
+            content="已成功上传，您可以开始对项目进行提问！",
+        ).send()
+@cl.on_chat_start
+async def start():
+    """Initialise a chat session.
+
+    Sends the settings panel (temperature, top_p, online switch), stores the
+    chosen values and an empty message history in the user session, and, for
+    the project Q&A profile, asks the user for a project (GitHub URL or local
+    zip file) and stores its files under the "project_index" session key.
+    """
+    settings = await cl.ChatSettings(
+        [
+            Slider(
+                id="temperature",
+                label="CodeGeeX4 - Temperature",
+                initial=0.2,
+                min=0,
+                max=1,
+                step=0.1,
+            ),
+            Slider(
+                id="top_p",
+                label="CodeGeeX4 - top_p",
+                initial=0.95,
+                min=0,
+                max=1,
+                step=0.1,
+            ),
+            Switch(
+                id="is_online",
+                label="CodeGeeX4 - is_online",
+                initial=True
+            ),
+        ]
+    ).send()
+    temperature = settings["temperature"]
+    top_p = settings["top_p"]
+    is_online = settings["is_online"]
+    cl.user_session.set("temperature", temperature)
+    cl.user_session.set("top_p", top_p)
+    cl.user_session.set("is_online", is_online)
+    cl.user_session.set("message_history", [])
+    chat_profile = cl.user_session.get("chat_profile")
+    if chat_profile != "项目问答":
+        return
+    extract_dir = "repodata"
+    res = await cl.AskActionMessage(
+        content="请选择项目上传方式",
+        actions=[
+            cl.Action(name="zip", value="zip", label="本地上传zip文件"),
+            cl.Action(name="url", value="url", label="上传GitHub链接"),
+        ],
+    ).send()
+    # The ask message yields no response when the user does not answer in time;
+    # in that case no project is loaded instead of failing on `None.get`.
+    upload_kind = res.get("value") if res else None
+    if upload_kind == "url":
+        repo_path = None
+        # Keep asking until a repository has been cloned successfully.
+        while repo_path is None:
+            reply = await cl.AskUserMessage(content="请你在下面消息框中提供GitHub仓库URL? ex：https://github.com/THUDM/CodeGeeX4", timeout=3600).send()
+            if reply:
+                repo_path = clone_repo(reply['output'], extract_dir)
+                if repo_path is None:
+                    await cl.Message(
+                        content="您的github链接无法正常下载，请检查项目链接或github网络连通情况。",
+                    ).send()
+        await _index_project(repo_path)
+    elif upload_kind == "zip":
+        files = None
+        # Re-prompt until a file has actually been uploaded.
+        while files is None:
+            files = await cl.AskFileMessage(
+                content="请上传项目zip压缩文件!",
+                accept={"application/zip": [".zip"]},
+                max_size_mb=50,
+            ).send()
+        text_file = files[0]
+        extracted_path = unzip_file(text_file.path, extract_dir)
+        await _index_project(extracted_path)
+@cl.step(type="tool")
+async def bing_search_tool(search_text):
+    """Run the Bing search prompt builder as a Chainlit tool step.
+
+    The query is recorded as the step input and the generated prompt as the
+    step output; the prompt is also returned to the caller.
+    """
+    step = cl.context.current_step
+    step.input = search_text
+    search_prompt = bing_search_prompt(search_text)
+    step.output = search_prompt
+    return search_prompt
+@cl.on_message
+async def main(message: cl.Message):
+    """Handle one user message: pick tools, build the prompt and stream the reply.
+
+    In the online chat profile an optional Bing search result is added to the
+    history as a "tool" turn before the user's message. In the project Q&A
+    profile the indexed project files are prepended to the first user turn
+    whenever the tool selector picked a tool. The reply is streamed to the UI
+    and appended to the session's message history.
+    """
+    chat_profile = cl.user_session.get("chat_profile")
+    message_history = cl.user_session.get("message_history")
+    tool_name = tools_choose_agent(message.content)
+    is_online = cl.user_session.get("is_online")
+    if chat_profile == "联网聊天":
+        if "online_query" in tool_name and is_online:
+            prompt_tmp = await bing_search_tool(message.content)
+            message_history.append({"role": "tool", "content": prompt_tmp})
+        message_history.append({"role": "user", "content": message.content})
+        prompt_content = get_cur_base_user_prompt(message_history=message_history)
+    elif chat_profile == "项目问答":
+        message_history.append({"role": "user", "content": message.content})
+        # No project may have been loaded (e.g. the upload prompt timed out).
+        project_index = cl.user_session.get("project_index") or []
+        index_prompt = "".join(
+            "###PATH:{path}\n{code}\n".format(path=index["path"], code=index["content"])
+            for index in project_index
+        )
+        if len(tool_name) > 0:
+            prompt_content = get_cur_base_user_prompt(
+                message_history=message_history,
+                index_prompt=index_prompt,
+                judge_context=tool_name[0],
+            )
+        else:
+            prompt_content = get_cur_base_user_prompt(message_history=message_history)
+    else:
+        # Unknown or missing profile: fall back to a plain chat turn so that
+        # prompt_content is always defined.
+        message_history.append({"role": "user", "content": message.content})
+        prompt_content = get_cur_base_user_prompt(message_history=message_history)
+    msg = cl.Message(content="")
+    await msg.send()
+    temperature = cl.user_session.get("temperature")
+    top_p = cl.user_session.get("top_p")
+    # prompt_content is a list of message dicts, so measure the text it carries
+    # (not the number of messages); characters / 4 is a rough token estimate.
+    prompt_chars = sum(len(item["content"]) for item in prompt_content)
+    if prompt_chars / 4 < 120000:
+        stream = codegeex4(prompt_content, temperature=temperature, top_p=top_p)
+        for part in stream:
+            # Empty chunks are streamed as a single space.
+            await msg.stream_token(part or " ")
+    else:
+        await msg.stream_token("项目太大了，请换小一点的项目。")
+    message_history.append({"role": "assistant", "content": msg.content})
+    await msg.update()
--- a/repodemo/utils/bingsearch.py
+++ b/repodemo/utils/bingsearch.py
+import requests
+from bs4 import BeautifulSoup as BS4
+import os
+# Bing Web Search subscription key, read once at import time from the
+# `bing_api_key` environment variable; it is None when the variable is unset.
+BING_API_KEY = os.getenv("bing_api_key")
+def search_with_bing(query: str, search_timeout=30, top_k=6) -> list[dict]:
+    """
+    Search with Bing and return the result contexts.
+
+    Reference: https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/overview
+
+    Args:
+        query: Search text.
+        search_timeout: Timeout in seconds passed to the HTTP request.
+        top_k: Maximum number of web page results to return.
+
+    Returns:
+        Up to `top_k` entries of the response's `webPages.value` list, or an
+        empty list when the request fails or the response has no such field.
+    """
+    try:
+        # The request itself is inside the try block so that timeouts and
+        # connection errors also produce the empty-list fallback.
+        response = requests.get(
+            url="https://api.bing.microsoft.com/v7.0/search",
+            headers={"Ocp-Apim-Subscription-Key": BING_API_KEY},
+            params={
+                "q": query,
+                "responseFilter": ["webpages"],
+                "freshness": "month",
+                "mkt": "zh-CN",
+            },
+            timeout=search_timeout,
+        )
+        json_content = response.json()
+        contexts = json_content["webPages"]["value"][:top_k]
+        return contexts
+    except Exception as e:
+        # Deliberately best-effort: any failure means "no search results".
+        print(f"搜索失败，错误原因: {e}")
+        return []
+def fetch_url(url, timeout=30):
+    """Download a page and return its text content with the HTML markup removed.
+
+    Args:
+        url: Address of the page to fetch.
+        timeout: Seconds to wait for the server before giving up; without a
+            timeout the request could block indefinitely.
+
+    Returns:
+        The plain text extracted from the response body by BeautifulSoup.
+    """
+    response = requests.get(url, timeout=timeout)
+    # use beautifulsoup4 to parse html
+    soup = BS4(response.text, "html.parser")
+    return soup.get_text()
+def bing_search_prompt(input):
+    """Build a citation-annotated prompt from the Bing results for `input`.
+
+    Each result's snippet becomes a numbered `[[citation:N]]` markdown block
+    (numbering starts at 1); the blocks are followed by the original question.
+    """
+    search_results = search_with_bing(input, search_timeout=5, top_k=6)
+    citation_blocks = []
+    for number, entry in enumerate(search_results, start=1):
+        citation_blocks.append(
+            f"[[citation:{number}]]\n```markdown\n{entry['snippet']}\n```"
+        )
+    citations = "\n\n".join(citation_blocks)
+    return f"[引用]\n{citations}\n问：{input}\n"
--- a/repodemo/utils/keep.txt
+++ b/repodemo/utils/keep.txt
+package-lock.json
+package.json
+config.json
+LICENSE
+yarn.lock
+requirements.txt
+Dockerfile
+build.gradle
+Makefile
+go.mod
+go.sum
+CHANGES.txt
+Cargo.toml
+pubspec.yaml
+root.json
+snapshot.json
+targets.json
+timestamp.json
+Cargo.lock
\ No newline at end of file
--- a/repodemo/utils/programming-languages-to-file-extensions.json
+++ b/repodemo/utils/programming-languages-to-file-extensions.json
+{"ABAP": [".abap"], "AGS Script": [".ash"], "AMPL": [".ampl"], "ANTLR": [".g4"], "API Blueprint": [".apib"], "APL": [".apl", ".dyalog"], "ASP": [".asp", ".asax", ".ascx", ".ashx", ".asmx", ".aspx", ".axd"], "ATS": [".dats", ".hats", ".sats"], "ActionScript": [".as"], "Ada": [".adb", ".ada", ".ads"], "Agda": [".agda"], "Alloy": [".als"], "ApacheConf": [".apacheconf", ".vhost"], "AppleScript": [".applescript", ".scpt"], "Arc": [".arc"], "Arduino": [".ino"], "AsciiDoc": [".asciidoc", ".adoc"], "AspectJ": [".aj"], "Assembly": [".asm", ".a51", ".nasm"], "Augeas": [".aug"], "AutoHotkey": [".ahk", ".ahkl"], "AutoIt": [".au3"], "Awk": [".awk", ".auk", ".gawk", ".mawk", ".nawk"], "Batchfile": [".bat", ".cmd"], "Befunge": [".befunge"], "Bison": [".bison"], "BitBake": [".bb"], "BlitzBasic": [".decls"], "BlitzMax": [".bmx"], "Bluespec": [".bsv"], "Boo": [".boo"], "Brainfuck": [".bf"], "Brightscript": [".brs"], "Bro": [".bro"], "C": [".c", ".cats", ".h", ".idc", ".w"], "C#": [".cs", ".cake", ".cshtml", ".csx"], "C++": [".cpp", ".c++", ".cc", ".cp", ".cxx", ".h++", ".hh", ".hpp", ".hxx", ".inl", ".ipp", ".tcc", ".tpp", ".C", ".H"], "C-ObjDump": [".c-objdump"], "C2hs Haskell": [".chs"], "CLIPS": [".clp"], "CMake": [".cmake", ".cmake.in"], "COBOL": [".cob", ".cbl", ".ccp", ".cobol", ".cpy"], "CSS": [".css"], "CSV": [".csv"], "Cap'n Proto": [".capnp"], "CartoCSS": [".mss"], "Ceylon": [".ceylon"], "Chapel": [".chpl"], "ChucK": [".ck"], "Cirru": [".cirru"], "Clarion": [".clw"], "Clean": [".icl", ".dcl"], "Click": [".click"], "Clojure": [".clj", ".boot", ".cl2", ".cljc", ".cljs", ".cljs.hl", ".cljscm", ".cljx", ".hic"], "CoffeeScript": [".coffee", "._coffee", ".cjsx", ".cson", ".iced"], "ColdFusion": [".cfm", ".cfml"], "ColdFusion CFC": [".cfc"], "Common Lisp": [".lisp", ".asd", ".lsp", ".ny", ".podsl", ".sexp"], "Component Pascal": [".cps"], "Coq": [".coq"], "Cpp-ObjDump": [".cppobjdump", ".c++-objdump", ".c++objdump", ".cpp-objdump", ".cxx-objdump"], "Creole": [".creole"], 
"Crystal": [".cr"], "Csound": [".csd"], "Cucumber": [".feature"], "Cuda": [".cu", ".cuh"], "Cycript": [".cy"], "Cython": [".pyx", ".pxd", ".pxi"], "D": [".di"], "D-ObjDump": [".d-objdump"], "DIGITAL Command Language": [".com"], "DM": [".dm"], "DNS Zone": [".zone", ".arpa"], "Darcs Patch": [".darcspatch", ".dpatch"], "Dart": [".dart"], "Diff": [".diff", ".patch"], "Dockerfile": [".dockerfile", "Dockerfile"], "Dogescript": [".djs"], "Dylan": [".dylan", ".dyl", ".intr", ".lid"], "E": [".E"], "ECL": [".ecl", ".eclxml"], "Eagle": [".sch", ".brd"], "Ecere Projects": [".epj"], "Eiffel": [".e"], "Elixir": [".ex", ".exs"], "Elm": [".elm"], "Emacs Lisp": [".el", ".emacs", ".emacs.desktop"], "EmberScript": [".em", ".emberscript"], "Erlang": [".erl", ".escript", ".hrl", ".xrl", ".yrl"], "F#": [".fs", ".fsi", ".fsx"], "FLUX": [".flux"], "FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"], "Factor": [".factor"], "Fancy": [".fy", ".fancypack"], "Fantom": [".fan"], "Formatted": [".eam.fs"], "Forth": [".fth", ".4th", ".forth", ".frt"], "FreeMarker": [".ftl"], "G-code": [".g", ".gco", ".gcode"], "GAMS": [".gms"], "GAP": [".gap", ".gi"], "GAS": [".s"], "GDScript": [".gd"], "GLSL": [".glsl", ".fp", ".frag", ".frg", ".fsh", ".fshader", ".geo", ".geom", ".glslv", ".gshader", ".shader", ".vert", ".vrx", ".vsh", ".vshader"], "Genshi": [".kid"], "Gentoo Ebuild": [".ebuild"], "Gentoo Eclass": [".eclass"], "Gettext Catalog": [".po", ".pot"], "Glyph": [".glf"], "Gnuplot": [".gp", ".gnu", ".gnuplot", ".plot", ".plt"], "Go": [".go"], "Golo": [".golo"], "Gosu": [".gst", ".gsx", ".vark"], "Grace": [".grace"], "Gradle": [".gradle"], "Grammatical Framework": [".gf"], "GraphQL": [".graphql"], "Graphviz (DOT)": [".dot", ".gv"], "Groff": [".man", ".1", ".1in", ".1m", ".1x", ".2", ".3", ".3in", ".3m", ".3qt", ".3x", ".4", ".5", ".6", ".7", ".8", ".9", ".me", ".rno", ".roff"], "Groovy": [".groovy", ".grt", ".gtpl", ".gvy"], "Groovy Server Pages": [".gsp"], "HCL": [".hcl", ".tf"], 
"HLSL": [".hlsl", ".fxh", ".hlsli"], "HTML": [".html", ".htm", ".html.hl", ".xht", ".xhtml"], "HTML+Django": [".mustache", ".jinja"], "HTML+EEX": [".eex"], "HTML+ERB": [".erb", ".erb.deface"], "HTML+PHP": [".phtml"], "HTTP": [".http"], "Haml": [".haml", ".haml.deface"], "Handlebars": [".handlebars", ".hbs"], "Harbour": [".hb"], "Haskell": [".hs", ".hsc"], "Haxe": [".hx", ".hxsl"], "Hy": [".hy"], "IDL": [".dlm"], "IGOR Pro": [".ipf"], "INI": [".ini", ".cfg", ".prefs", ".properties"], "IRC log": [".irclog", ".weechatlog"], "Idris": [".idr", ".lidr"], "Inform 7": [".ni", ".i7x"], "Inno Setup": [".iss"], "Io": [".io"], "Ioke": [".ik"], "Isabelle": [".thy"], "J": [".ijs"], "JFlex": [".flex", ".jflex"], "JSON": [".json", ".geojson", ".lock", ".topojson"], "JSON5": [".json5"], "JSONLD": [".jsonld"], "JSONiq": [".jq"], "JSX": [".jsx"], "Jade": [".jade"], "Jasmin": [".j"], "Java": [".java"], "Java Server Pages": [".jsp"], "JavaScript": [".js", "._js", ".bones", ".es6", ".jake", ".jsb", ".jscad", ".jsfl", ".jsm", ".jss", ".njs", ".pac", ".sjs", ".ssjs", ".xsjs", ".xsjslib"], "Julia": [".jl"], "Jupyter Notebook": [".ipynb"], "KRL": [".krl"], "KiCad": [".kicad_pcb"], "Kit": [".kit"], "Kotlin": [".kt", ".ktm", ".kts"], "LFE": [".lfe"], "LLVM": [".ll"], "LOLCODE": [".lol"], "LSL": [".lsl", ".lslp"], "LabVIEW": [".lvproj"], "Lasso": [".lasso", ".las", ".lasso8", ".lasso9", ".ldml"], "Latte": [".latte"], "Lean": [".lean", ".hlean"], "Less": [".less"], "Lex": [".lex"], "LilyPond": [".ly", ".ily"], "Linker Script": [".ld", ".lds"], "Liquid": [".liquid"], "Literate Agda": [".lagda"], "Literate CoffeeScript": [".litcoffee"], "Literate Haskell": [".lhs"], "LiveScript": [".ls", "._ls"], "Logos": [".xm", ".x", ".xi"], "Logtalk": [".lgt", ".logtalk"], "LookML": [".lookml"], "Lua": [".lua", ".nse", ".pd_lua", ".rbxs", ".wlua"], "M": [".mumps"], "M4": [".m4"], "MAXScript": [".mcr"], "MTML": [".mtml"], "MUF": [".muf"], "Makefile": [".mak", ".mk", ".mkfile", "Makefile"], "Mako": [".mako", 
".mao"], "Maple": [".mpl"], "Markdown": [".md", ".markdown", ".mkd", ".mkdn", ".mkdown", ".ron"], "Mask": [".mask"], "Mathematica": [".mathematica", ".cdf", ".ma", ".mt", ".nb", ".nbp", ".wl", ".wlt"], "Matlab": [".matlab"], "Max": [".maxpat", ".maxhelp", ".maxproj", ".mxt", ".pat"], "MediaWiki": [".mediawiki", ".wiki"], "Metal": [".metal"], "MiniD": [".minid"], "Mirah": [".druby", ".duby", ".mir", ".mirah"], "Modelica": [".mo"], "Module Management System": [".mms", ".mmk"], "Monkey": [".monkey"], "MoonScript": [".moon"], "Myghty": [".myt"], "NSIS": [".nsi", ".nsh"], "NetLinx": [".axs", ".axi"], "NetLinx+ERB": [".axs.erb", ".axi.erb"], "NetLogo": [".nlogo"], "Nginx": [".nginxconf"], "Nimrod": [".nim", ".nimrod"], "Ninja": [".ninja"], "Nit": [".nit"], "Nix": [".nix"], "Nu": [".nu"], "NumPy": [".numpy", ".numpyw", ".numsc"], "OCaml": [".ml", ".eliom", ".eliomi", ".ml4", ".mli", ".mll", ".mly"], "ObjDump": [".objdump"], "Objective-C++": [".mm"], "Objective-J": [".sj"], "Octave": [".oct"], "Omgrofl": [".omgrofl"], "Opa": [".opa"], "Opal": [".opal"], "OpenCL": [".cl", ".opencl"], "OpenEdge ABL": [".p"], "OpenSCAD": [".scad"], "Org": [".org"], "Ox": [".ox", ".oxh", ".oxo"], "Oxygene": [".oxygene"], "Oz": [".oz"], "PAWN": [".pwn"], "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"], "POV-Ray SDL": [".pov"], "Pan": [".pan"], "Papyrus": [".psc"], "Parrot": [".parrot"], "Parrot Assembly": [".pasm"], "Parrot Internal Representation": [".pir"], "Pascal": [".pas", ".dfm", ".dpr", ".lpr"], "Perl": [".pl", ".al", ".perl", ".ph", ".plx", ".pm", ".psgi", ".t"], "Perl6": [".6pl", ".6pm", ".nqp", ".p6", ".p6l", ".p6m", ".pl6", ".pm6"], "Pickle": [".pkl"], "PigLatin": [".pig"], "Pike": [".pike", ".pmod"], "Pod": [".pod"], "PogoScript": [".pogo"], "Pony": [".pony"], "PostScript": [".ps", ".eps"], "PowerShell": [".ps1", ".psd1", ".psm1"], "Processing": [".pde"], "Prolog": [".prolog", ".yap"], "Propeller Spin": [".spin"], "Protocol Buffer": [".proto"], "Public 
Key": [".pub"], "Pure Data": [".pd"], "PureBasic": [".pb", ".pbi"], "PureScript": [".purs"], "Python": [".py", ".bzl", ".gyp", ".lmi", ".pyde", ".pyp", ".pyt", ".pyw", ".tac", ".wsgi", ".xpy"], "Python traceback": [".pytb"], "QML": [".qml", ".qbs"], "QMake": [".pri"], "R": [".r", ".rd", ".rsx"], "RAML": [".raml"], "RDoc": [".rdoc"], "REALbasic": [".rbbas", ".rbfrm", ".rbmnu", ".rbres", ".rbtbar", ".rbuistate"], "RHTML": [".rhtml"], "RMarkdown": [".rmd"], "Racket": [".rkt", ".rktd", ".rktl", ".scrbl"], "Ragel in Ruby Host": [".rl"], "Raw token data": [".raw"], "Rebol": [".reb", ".r2", ".r3", ".rebol"], "Red": [".red", ".reds"], "Redcode": [".cw"], "Ren'Py": [".rpy"], "RenderScript": [".rsh"], "RobotFramework": [".robot"], "Rouge": [".rg"], "Ruby": [".rb", ".builder", ".gemspec", ".god", ".irbrc", ".jbuilder", ".mspec", ".podspec", ".rabl", ".rake", ".rbuild", ".rbw", ".rbx", ".ru", ".ruby", ".thor", ".watchr"], "Rust": [".rs", ".rs.in"], "SAS": [".sas"], "SCSS": [".scss"], "SMT": [".smt2", ".smt"], "SPARQL": [".sparql", ".rq"], "SQF": [".sqf", ".hqf"], "SQL": [".pls", ".pck", ".pkb", ".pks", ".plb", ".plsql", ".sql", ".cql", ".ddl", ".prc", ".tab", ".udf", ".viw", ".db2"], "STON": [".ston"], "SVG": [".svg"], "Sage": [".sage", ".sagews"], "SaltStack": [".sls"], "Sass": [".sass"], "Scala": [".scala", ".sbt"], "Scaml": [".scaml"], "Scheme": [".scm", ".sld", ".sps", ".ss"], "Scilab": [".sci", ".sce"], "Self": [".self"], "Shell": [".sh", ".bash", ".bats", ".command", ".ksh", ".sh.in", ".tmux", ".tool", ".zsh"], "ShellSession": [".sh-session"], "Shen": [".shen"], "Slash": [".sl"], "Slim": [".slim"], "Smali": [".smali"], "Smalltalk": [".st"], "Smarty": [".tpl"], "Solidity": [".sol"], "SourcePawn": [".sp", ".sma"], "Squirrel": [".nut"], "Stan": [".stan"], "Standard ML": [".ML", ".fun", ".sig", ".sml"], "Stata": [".do", ".ado", ".doh", ".ihlp", ".mata", ".matah", ".sthlp"], "Stylus": [".styl"], "SuperCollider": [".scd"], "Swift": [".swift"], "SystemVerilog": [".sv", ".svh", 
".vh"], "TOML": [".toml"], "TXL": [".txl"], "Tcl": [".tcl", ".adp", ".tm"], "Tcsh": [".tcsh", ".csh"], "TeX": [".tex", ".aux", ".bbx", ".bib", ".cbx", ".dtx", ".ins", ".lbx", ".ltx", ".mkii", ".mkiv", ".mkvi", ".sty", ".toc"], "Tea": [".tea"], "Text": [".txt", ".no"], "Textile": [".textile"], "Thrift": [".thrift"], "Turing": [".tu"], "Turtle": [".ttl"], "Twig": [".twig"], "TypeScript": [".ts", ".tsx"], "Unified Parallel C": [".upc"], "Unity3D Asset": [".anim", ".asset", ".mat", ".meta", ".prefab", ".unity"], "Uno": [".uno"], "UnrealScript": [".uc"], "UrWeb": [".ur", ".urs"], "VCL": [".vcl"], "VHDL": [".vhdl", ".vhd", ".vhf", ".vhi", ".vho", ".vhs", ".vht", ".vhw"], "Vala": [".vala", ".vapi"], "Verilog": [".veo"], "VimL": [".vim"], "Visual Basic": [".vb", ".bas", ".frm", ".frx", ".vba", ".vbhtml", ".vbs"], "Volt": [".volt"], "Vue": [".vue"], "Web Ontology Language": [".owl"], "WebAssembly": [".wat"], "WebIDL": [".webidl"], "X10": [".x10"], "XC": [".xc"], "XML": [".xml", ".ant", ".axml", ".ccxml", ".clixml", ".cproject", ".csl", ".csproj", ".ct", ".dita", ".ditamap", ".ditaval", ".dll.config", ".dotsettings", ".filters", ".fsproj", ".fxml", ".glade", ".grxml", ".iml", ".ivy", ".jelly", ".jsproj", ".kml", ".launch", ".mdpolicy", ".mxml", ".nproj", ".nuspec", ".odd", ".osm", ".plist", ".props", ".ps1xml", ".psc1", ".pt", ".rdf", ".rss", ".scxml", ".srdf", ".storyboard", ".stTheme", ".sublime-snippet", ".targets", ".tmCommand", ".tml", ".tmLanguage", ".tmPreferences", ".tmSnippet", ".tmTheme", ".ui", ".urdf", ".ux", ".vbproj", ".vcxproj", ".vssettings", ".vxml", ".wsdl", ".wsf", ".wxi", ".wxl", ".wxs", ".x3d", ".xacro", ".xaml", ".xib", ".xlf", ".xliff", ".xmi", ".xml.dist", ".xproj", ".xsd", ".xul", ".zcml"], "XPages": [".xsp-config", ".xsp.metadata"], "XProc": [".xpl", ".xproc"], "XQuery": [".xquery", ".xq", ".xql", ".xqm", ".xqy"], "XS": [".xs"], "XSLT": [".xslt", ".xsl"], "Xojo": [".xojo_code", ".xojo_menu", ".xojo_report", ".xojo_script", ".xojo_toolbar", 
".xojo_window"], "Xtend": [".xtend"], "YAML": [".yml", ".reek", ".rviz", ".sublime-syntax", ".syntax", ".yaml", ".yaml-tmlanguage"], "YANG": [".yang"], "Yacc": [".y", ".yacc", ".yy"], "Zephir": [".zep"], "Zig": [".zig"], "Zimpl": [".zimpl", ".zmpl", ".zpl"], "desktop": [".desktop", ".desktop.in"], "eC": [".ec", ".eh"], "edn": [".edn"], "fish": [".fish"], "mupad": [".mu"], "nesC": [".nc"], "ooc": [".ooc"], "reStructuredText": [".rst", ".rest", ".rest.txt", ".rst.txt"], "wisp": [".wisp"], "xBase": [".prg", ".prw"]}
\ No newline at end of file
--- a/repodemo/utils/tools.py
+++ b/repodemo/utils/tools.py
+import json
+import os
+import zipfile
+import git
+import urllib.parse
+import re
def is_valid_json(json_string):
    """Report whether ``json_string`` contains parseable JSON.

    If the text contains a ``{ ... }`` span (from the first ``{`` to the last
    ``}``), only that span is parsed, so a JSON object surrounded by other
    text is accepted. Otherwise the whole string is parsed.

    Args:
        json_string: The text to check.

    Returns:
        bool: True when parsing succeeds; False when it fails or when
        ``json_string`` is not a ``str`` (for example ``None``).
    """
    # re.search raises TypeError on non-string input; treat that as invalid
    # instead of letting the error escape from a yes/no predicate.
    if not isinstance(json_string, str):
        return False
    match = re.search(r'\{.*\}', json_string, re.DOTALL)
    candidate = match.group() if match else json_string
    try:
        json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True
def clone_repo(repo_url, clone_to):
    """Clone a Git repository into a sub-directory of ``clone_to``.

    The sub-directory is named after the last path segment of ``repo_url``.
    If that sub-directory already exists it is returned as-is and nothing is
    cloned.

    Args:
        repo_url (str): URL of the repository to clone.
        clone_to (str): Local directory that will contain the clone; created
            if it does not exist.

    Returns:
        str | None: Path of the cloned (or already present) repository
        directory, or None if anything fails. Failures are printed, not
        raised.
    """
    try:
        os.makedirs(clone_to, exist_ok=True)
        # Strip trailing slashes first: for ".../repo/" the last segment would
        # otherwise be empty, making cloned_path equal to clone_to itself and
        # causing the "already exists" shortcut below to fire without cloning.
        url_path = urllib.parse.urlparse(repo_url).path.rstrip('/')
        repo_name = url_path.split('/')[-1]
        if not repo_name:
            raise ValueError(f"cannot determine repository name from URL {repo_url!r}")
        cloned_path = os.path.join(clone_to, repo_name)
        if os.path.exists(cloned_path):
            return cloned_path
        git.Repo.clone_from(repo_url, cloned_path)
        print(f"Repository cloned to {cloned_path}")
        return cloned_path
    except Exception as e:
        # Deliberate best-effort: callers receive None rather than an exception.
        print(f"Failed to clone repository: {e}")
        return None
def unzip_file(zip_path, extract_dir):
    """Extract a zip archive into a new sub-directory of ``extract_dir``.

    The sub-directory is named after the archive's file name without its
    final extension (``foo.zip`` -> ``<extract_dir>/foo``).

    Args:
        zip_path (str): Path of the zip archive.
        extract_dir (str): Directory under which the sub-directory is created.

    Returns:
        str: Path of the directory the archive was extracted into.
    """
    archive_name = os.path.basename(zip_path)
    dir_name = os.path.splitext(archive_name)[0]
    new_extract_dir = os.path.join(extract_dir, dir_name)
    # makedirs creates missing parents (extract_dir included); exist_ok avoids
    # the check-then-create race of a separate os.path.exists() test.
    os.makedirs(new_extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(new_extract_dir)
    return new_extract_dir
def get_project_files_with_content(project_dir):
    """Collect the relative path and text content of accepted project files.

    A file is included when ``filter_data`` accepts its name and no component
    of its path below ``project_dir`` contains ``"__MACOSX"``.

    Args:
        project_dir (str): Root directory of the project.

    Returns:
        list: One ``{"path": <path relative to project_dir>, "content": <text>}``
        dict per included file. Files are decoded as UTF-8 with undecodable
        bytes dropped.
    """
    files_list = []
    for root, dirs, files in os.walk(project_dir):
        # Prune in place so os.walk never descends into "__MACOSX" folders,
        # instead of walking them and discarding every file afterwards.
        dirs[:] = [d for d in dirs if "__MACOSX" not in d]
        for file in files:
            # Cheap name test first; filter_data is the expensive check.
            if "__MACOSX" in file or not filter_data(file):
                continue
            file_path = os.path.join(root, file)
            relative_path = os.path.relpath(file_path, project_dir)
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
            files_list.append({"path": relative_path, "content": content})
    return files_list
# Lower-cased language names accepted by filter_data, each mapped to a
# comment-style tag for that language. Only the keys are consulted here.
_LANGUAGE_TAG = {
    "c++": "// C++",
    "cpp": "// C++",
    "c": "// C",
    "c#": "// C#",
    "c-sharp": "// C#",
    "css": "/* CSS */",
    "cuda": "// Cuda",
    "fortran": "! Fortran",
    "go": "// Go",
    "html": "<!-- HTML -->",
    "java": "// Java",
    "js": "// JavaScript",
    "javascript": "// JavaScript",
    "kotlin": "// Kotlin",
    "lean": "-- Lean",
    "lua": "-- Lua",
    "objectivec": "// Objective-C",
    "objective-c": "// Objective-C",
    "objective-c++": "// Objective-C++",
    "pascal": "// Pascal",
    "php": "// PHP",
    "python": "# Python",
    "r": "# R",
    "rust": "// Rust",
    "ruby": "# Ruby",
    "scala": "// Scala",
    "shell": "# Shell",
    "sql": "-- SQL",
    "tex": "% TeX",
    "typescript": "// TypeScript",
    "vue": "<!-- Vue -->",
    "assembly": "; Assembly",
    "dart": "// Dart",
    "perl": "# Perl",
    "prolog": "% Prolog",
    "swift": "// swift",
    "lisp": "; Lisp",
    "vb": "' Visual Basic",
    "visual basic": "' Visual Basic",
    "matlab": "% Matlab",
    "delphi": "{ Delphi }",
    "scheme": "; Scheme",
    "basic": "' Basic",
    "groovy": "// Groovy",
    "abap": "* Abap",
    "gdscript": "# GDScript",
    "haskell": "-- Haskell",
    "julia": "# Julia",
    "elixir": "# Elixir",
    "excel": "' Excel",
    "clojure": "; Clojure",
    "actionscript": "// ActionScript",
    "solidity": "// Solidity",
    "powershell": "# PowerShell",
    "erlang": "% Erlang",
    "cobol": "// Cobol",
    "batchfile": ":: Batch file",
    "makefile": "# Makefile",
    "dockerfile": "# Dockerfile",
    "markdown": "<!-- Markdown -->",
    "cmake": "# CMake",
}

# Both data files are opened relative to the current working directory.
_EXTENSIONS_JSON_PATH = "utils/programming-languages-to-file-extensions.json"
_KEEP_LIST_PATH = "utils/keep.txt"

# Filled on the first successful load so the data files are parsed once per
# process instead of once per file name checked.
_FILTER_TABLES = {}


def _load_filter_tables():
    """Return ``(wanted_extensions, keep_files)``, reading the data files once."""
    if not _FILTER_TABLES:
        with open(_EXTENSIONS_JSON_PATH) as f:
            language_to_extensions = json.load(f)
        wanted_extensions = set()
        for language, extensions in language_to_extensions.items():
            if language.lower() in _LANGUAGE_TAG:
                wanted_extensions.update(extensions)
        with open(_KEEP_LIST_PATH, "r") as f:
            keep_files = {line.strip() for line in f}
        # Store only after both reads succeeded, so a failed load is retried.
        _FILTER_TABLES["wanted_extensions"] = frozenset(wanted_extensions)
        _FILTER_TABLES["keep_files"] = frozenset(keep_files)
    return _FILTER_TABLES["wanted_extensions"], _FILTER_TABLES["keep_files"]


def filter_data(obj):
    """Decide whether a file name should be kept.

    A name is kept when the text after its last ``.`` (with the dot
    re-attached) is an extension of a language listed in ``_LANGUAGE_TAG``
    according to the language-to-extensions JSON file, or when the whole name
    appears as a line in the keep list.

    Args:
        obj (str): File name to test.

    Returns:
        bool: True if the file should be kept, False otherwise.
    """
    wanted_extensions, keep_files = _load_filter_tables()
    # A name without any "." yields "." + the whole name, which matches no
    # extension and therefore falls through to the keep list.
    ext = "." + obj.split(".")[-1]
    return ext in wanted_extensions or obj in keep_files
--- a/requirements.txt
+++ b/requirements.txt
+#torch>=2.0
+#tokenizers>=0.14.0
+#transformers==4.35.0
+#accelerate
+#deepspeed==0.12.2
+sympy==1.12
+pebble
+timeout-decorator
+accelerate
+attrdict
+tqdm
+datasets
+tensorboardX
\ No newline at end of file
--- a/resources/all_functions.jpg
+++ b/resources/all_functions.jpg
--- a/resources/all_functions_zh.jpg
+++ b/resources/all_functions_zh.jpg
--- a/resources/local_mode.png
+++ b/resources/local_mode.png
--- a/resources/local_mode_zh.png
+++ b/resources/local_mode_zh.png
--- a/resources/logo.jpeg
+++ b/resources/logo.jpeg
--- a/web_demo/README.md
+++ b/web_demo/README.md
+![](../resources/logo.jpeg)
+[English](README.md) | [中文](README_zh.md)
+## Online Functionality
+CodeGeeX4 supports online search and question answering: it calls the Bing API to retrieve search results, giving it access to the
+latest information.
+## Usage Tutorial
+### 1. Install Dependencies
+```bash
+pip install -r requirements.txt
+```
+### 2. Configure Bing API Key
+Configure `BING_API_KEY` in `backend/apis/bing.py`.
+For more details, refer to the
+[Bing Search API](https://learn.microsoft.com/zh-cn/previous-versions/azure/cognitive-services/Bing-Web-Search/bing-api-comparison) documentation.
+### 3. Run the Project
+```bash
+python main.py
+>>> Running on local URL:  http://127.0.0.1:8080
+```
+## Demo
+![](resources/demo.png)
\ No newline at end of file
--- a/web_demo/README_zh.md
+++ b/web_demo/README_zh.md
+![](../resources/logo.jpeg)
+[English](README.md) | [中文](README_zh.md)
+## 联网功能
+CodeGeeX4支持联网搜索问答，通过调用Bing API获取搜索结果，可获取最新资讯。
+## 使用教程
+### 1. 安装依赖项
+```bash
+pip install -r requirements.txt
+```
+### 2. 配置Bing API Key
+在`backend/apis/bing.py`中配置`BING_API_KEY`
+详情可参考 [必应搜索API](https://learn.microsoft.com/zh-cn/previous-versions/azure/cognitive-services/Bing-Web-Search/bing-api-comparison)
+### 3. 运行项目
+```bash
+python main.py
+>>> Running on local URL:  http://127.0.0.1:8080
+```
+## Demo
+![](resources/demo_zh.png)
\ No newline at end of file