Commit 0e1045f0 authored by lvzhen

Revert "Merge branch 'master' into 'master'"

This reverts merge request !2
parent 467ec853
LOCAL_MODEL_PATH=<your_path>
LOCAL_EMBEDDING_MODEL_PATH=<your_path>
\ No newline at end of file
"""
This script implements an API for the ChatGLM3-6B model,
formatted similarly to OpenAI's API (https://platform.openai.com/docs/api-reference/chat).
It's designed to be run as a web server using FastAPI and uvicorn,
making the ChatGLM3-6B model accessible through the OpenAI client.
Key Components and Features:
- Model and Tokenizer Setup: Configures the model and tokenizer paths and loads them.
- FastAPI Configuration: Sets up a FastAPI application with CORS middleware for handling cross-origin requests.
- API Endpoints:
- "/v1/models": Lists the available models, specifically ChatGLM3-6B.
- "/v1/chat/completions": Processes chat completion requests with options for streaming and regular responses.
- "/v1/embeddings": Processes Embedding request of a list of text inputs.
- Token Limit Caution: In the OpenAI API, 'max_tokens' is equivalent to HuggingFace's 'max_new_tokens', not 'max_length'.
For instance, setting 'max_tokens' to 8192 for a 6b model would result in an error due to the model's inability to output
that many tokens after accounting for the history and prompt tokens.
- Stream Handling and Custom Functions: Manages streaming responses and custom function calls within chat responses.
- Pydantic Models: Defines structured models for requests and responses, enhancing API documentation and type safety.
- Main Execution: Initializes the model and tokenizer, and starts the FastAPI app on the designated host and port.
Note:
This script doesn't include the setup for special tokens or multi-GPU support by default.
Users need to configure their special tokens and can enable multi-GPU support as per the provided instructions.
The embedding model only supports running on a single GPU.
Running this script requires 14-15 GB of GPU memory: 2 GB for the embedding model and 12-13 GB for the FP16 ChatGLM3 LLM.
"""
import os
import time
import tiktoken
import torch
import uvicorn
import json
from fastapi import FastAPI, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
from loguru import logger
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
from sentence_transformers import SentenceTransformer
from tools.schema import tool_class, tool_def, tool_param_start_with, tool_define_param_name
from sse_starlette.sse import EventSourceResponse
# Set the SSE ping interval to keep long-running streaming requests alive
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000
# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', 'BAAI/bge-m3')
@asynccontextmanager
async def lifespan(app: FastAPI):
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class FunctionCallResponse(BaseModel):
name: Optional[str] = None
arguments: Optional[str] = None
class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system", "function"]
content: str = None
name: Optional[str] = None
function_call: Optional[FunctionCallResponse] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
function_call: Optional[FunctionCallResponse] = None
## for Embedding
class EmbeddingRequest(BaseModel):
input: Union[List[str], str]
model: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class EmbeddingResponse(BaseModel):
data: list
model: str
object: str
usage: CompletionUsage
# for ChatCompletionRequest
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = 0.8
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
tools: Optional[Union[dict, List[dict]]] = None
repetition_penalty: Optional[float] = 1.1
agent: Optional[bool] = False
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length", "function_call"]
class ChatCompletionResponseStreamChoice(BaseModel):
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length", "function_call"]]
index: int
class ChatCompletionResponse(BaseModel):
model: str
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
if isinstance(request.input, str):
embeddings = [embedding_model.encode(request.input)]
else:
embeddings = [embedding_model.encode(text) for text in request.input]
embeddings = [embedding.tolist() for embedding in embeddings]
def num_tokens_from_string(string: str) -> int:
"""
Returns the number of tokens in a text string.
use cl100k_base tokenizer
"""
encoding = tiktoken.get_encoding('cl100k_base')
num_tokens = len(encoding.encode(string))
return num_tokens
response = {
"data": [
{
"object": "embedding",
"embedding": embedding,
"index": index
}
for index, embedding in enumerate(embeddings)
],
"model": request.model,
"object": "list",
"usage": CompletionUsage(
prompt_tokens=sum(len(text.split()) for text in request.input),
completion_tokens=0,
total_tokens=sum(num_tokens_from_string(text) for text in request.input),
)
}
return response
@app.get("/v1/models", response_model=ModelList)
async def list_models():
model_card = ModelCard(
id="chatglm3-6b"
)
return ModelList(
data=[model_card]
)
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
global model, tokenizer
if len(request.messages) < 1 or request.messages[-1].role == "assistant":
raise HTTPException(status_code=400, detail="Invalid request")
gen_params = dict(
messages=request.messages,
temperature=request.temperature,
top_p=request.top_p,
max_tokens=request.max_tokens or 1024,
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
agent=request.agent
)
logger.debug(f"==== request ====\n{gen_params}")
gen_params["tools"] = tool_def if gen_params["agent"] else []
if request.stream:
# Use stream mode to read the first few characters; if it is not a function call, stream the output directly
predict_stream_generator = predict_stream(request.model, gen_params)
output = next(predict_stream_generator)
if not contains_custom_function(output, gen_params["tools"]):
return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
# Otherwise, obtain the full result at once and determine whether tools need to be called.
logger.debug(f"First result output:\n{output}")
function_call = None
if output and request.tools:
try:
function_call = process_response(output, use_tool=True)
except:
logger.warning("Failed to parse tool call")
# CallFunction
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
"""
In this demo, we did not register any tools.
You can use the tools implemented in our `tools_using_demo` and add your own streaming tool implementation here,
similar to the following method:
"""
if tool_param_start_with in output:
tool = tool_class.get(function_call.name)
if tool:
this_tool_define_param_name = tool_define_param_name.get(function_call.name)
if this_tool_define_param_name:
tool_param = json.loads(function_call.arguments).get(this_tool_define_param_name)
if tool().parameter_validation(tool_param):
observation = str(tool().run(tool_param))
tool_response = observation
else:
tool_response = "Tool parameter values error, please tell the user about this situation."
else:
tool_response = "Tool parameter is not defined in tools schema, please tell the user about this situation."
else:
tool_response = "No available tools found, please tell the user about this situation."
else:
tool_response = "Tool parameter content error, please tell the user about this situation."
if not gen_params.get("messages"):
gen_params["messages"] = []
gen_params["messages"].append(ChatMessage(
role="assistant",
content=output,
))
gen_params["messages"].append(ChatMessage(
role="function",
name=function_call.name,
content=tool_response,
))
# Streaming output of results after function calls
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
else:
# Fallback when no function call was parsed; stream the first output directly.
generate = parse_output_text(request.model, output)
return EventSourceResponse(generate, media_type="text/event-stream")
# Here is the handling of stream = False
response = generate_chatglm3(model, tokenizer, gen_params)
# Remove the first newline character
if response["text"].startswith("\n"):
response["text"] = response["text"][1:]
response["text"] = response["text"].strip()
usage = UsageInfo()
function_call, finish_reason = None, "stop"
if request.tools:
try:
function_call = process_response(response["text"], use_tool=True)
except:
logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
finish_reason = "function_call"
function_call = FunctionCallResponse(**function_call)
message = ChatMessage(
role="assistant",
content=response["text"],
function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
)
logger.debug(f"==== message ====\n{message}")
choice_data = ChatCompletionResponseChoice(
index=0,
message=message,
finish_reason=finish_reason,
)
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(
model=request.model,
id="", # for open_source model, id is empty
choices=[choice_data],
object="chat.completion",
usage=usage
)
async def predict(model_id: str, params: dict):
global model, tokenizer
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
for new_response in generate_stream_chatglm3(model, tokenizer, params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(previous_text):]
previous_text = decoded_unicode
finish_reason = new_response["finish_reason"]
if len(delta_text) == 0 and finish_reason != "function_call":
continue
function_call = None
if finish_reason == "function_call":
try:
function_call = process_response(decoded_unicode, use_tool=True)
except:
logger.warning(
"Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
delta = DeltaMessage(
content=delta_text,
role="assistant",
function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=delta,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def predict_stream(model_id, gen_params):
"""
Makes function calls compatible with stream-mode output.
The first several characters are inspected to decide whether the output is a function call.
If it is not a function call, the output is streamed directly.
Otherwise, the complete text of the function call is returned.
:param model_id:
:param gen_params:
:return:
"""
output = ""
is_function_call = False
has_send_first_chunk = False
for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(output):]
output = decoded_unicode
# While the output has not been identified as a function call and more than 7 characters have been generated,
# check for the special function prefix to decide whether this is a function call
if not is_function_call and len(output) > 7:
# Determine whether a function is called
is_function_call = contains_custom_function(output, gen_params["tools"])
if is_function_call:
continue
# Non-function call, direct stream output
finish_reason = new_response["finish_reason"]
# Send an empty string first to avoid truncation by subsequent next() operations.
if not has_send_first_chunk:
message = DeltaMessage(
content="",
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
send_msg = delta_text if has_send_first_chunk else output
has_send_first_chunk = True
message = DeltaMessage(
content=send_msg,
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
if is_function_call:
yield output
else:
yield '[DONE]'
async def parse_output_text(model_id: str, value: str):
"""
Directly output the text content of value
:param model_id:
:param value:
:return:
"""
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def contains_custom_function(value: str, tools: list) -> bool:
"""
Determine whether the output is a 'function_call' based on a special function prefix.
[Note] This is not a rigorous judgment method, only for reference.
:param value:
:param tools:
:return:
"""
for tool in tools:
if value and tool["name"] in value:
return True
if __name__ == "__main__":
# Load LLM
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True, device_map="auto").eval()
# load Embedding
embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
version: "3.6"
services:
glm3_api:
image: python:3.10.13-slim
restart: unless-stopped
working_dir: /glm3
container_name: glm3_api
env_file: ./.env
networks:
- v_glm3
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
- MODEL_PATH=/models/chatglm3-6b
- EMBEDDING_PATH=/models/bge-large-zh-v1.5
- TZ=Asia/Shanghai
- PYTHONDONTWRITEBYTECODE=1
- PYTHONUNBUFFERED=1
- DOCKER=True
ports:
- 8100:8000
volumes:
- ./:/glm3
- ${LOCAL_MODEL_PATH}:/models/chatglm3-6b
- ${LOCAL_EMBEDDING_MODEL_PATH}:/models/bge-large-zh-v1.5
command:
- sh
- -c
- |
sed -i s/deb.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
sed -i s/security.debian.org/mirrors.tencentyun.com/g /etc/apt/sources.list
apt-get update
python -m pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple --upgrade pip
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
python api_server.py
networks:
v_glm3:
driver: bridge
\ No newline at end of file
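A minimal way to bring this service up, assuming the compose file above is saved as `docker-compose.yml` next to `api_server.py` and the `.env` shown earlier is filled in (the model paths are placeholders):

```bash
# Sketch only: the stock python image installs requirements at container start (see the `command:` section above)
docker compose up -d                  # start the glm3_api service in the background
docker compose logs -f glm3_api       # follow dependency installation and server startup
curl http://127.0.0.1:8100/health     # container port 8000 is published on host port 8100
```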
"""
This script is designed for interacting with a local GLM3 AI model using the `ChatGLM3` class
from the `langchain_community` library. It facilitates continuous dialogue with the GLM3 model.
1. Start the Local Model Service: Before running this script, you need to execute the `api_server.py` script
to start the GLM3 model's service.
2. Run the Script: The script includes functionality for initializing the LLMChain object and obtaining AI responses,
allowing the user to input questions and receive AI answers.
3. This demo does not support streaming.
"""
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain_community.llms.chatglm3 import ChatGLM3
def initialize_llm_chain(messages: list):
template = "{input}"
prompt = PromptTemplate.from_template(template)
endpoint_url = "http://127.0.0.1:8000/v1/chat/completions"
llm = ChatGLM3(
endpoint_url=endpoint_url,
max_tokens=4096,
prefix_messages=messages,
top_p=0.9
)
return LLMChain(prompt=prompt, llm=llm)
def get_ai_response(llm_chain, user_message):
ai_response = llm_chain.invoke({"input": user_message})
return ai_response
def continuous_conversation():
messages = [
SystemMessage(content="You are an intelligent AI assistant, named ChatGLM3."),
]
while True:
user_input = input("Human (or 'exit' to quit): ")
if user_input.lower() == 'exit':
break
llm_chain = initialize_llm_chain(messages=messages)
ai_response = get_ai_response(llm_chain, user_input)
print("ChatGLM3: ", ai_response["text"])
messages += [
HumanMessage(content=user_input),
AIMessage(content=ai_response["text"]),
]
if __name__ == "__main__":
continuous_conversation()
"""
This script implements an API for the ChatGLM3-6B model,
formatted similarly to OpenAI's API (https://platform.openai.com/docs/api-reference/chat).
It's designed to be run as a web server using FastAPI and uvicorn,
making the ChatGLM3-6B model accessible through the OpenAI client.
Key Components and Features:
- Model and Tokenizer Setup: Configures the model and tokenizer paths and loads them.
- FastAPI Configuration: Sets up a FastAPI application with CORS middleware for handling cross-origin requests.
- API Endpoints:
- "/v1/models": Lists the available models, specifically ChatGLM3-6B.
- "/v1/chat/completions": Processes chat completion requests with options for streaming and regular responses.
- "/v1/embeddings": Processes Embedding request of a list of text inputs.
- Token Limit Caution: In the OpenAI API, 'max_tokens' is equivalent to HuggingFace's 'max_new_tokens', not 'max_length'.
For instance, setting 'max_tokens' to 8192 for a 6b model would result in an error due to the model's inability to output
that many tokens after accounting for the history and prompt tokens.
- Stream Handling and Custom Functions: Manages streaming responses and custom function calls within chat responses.
- Pydantic Models: Defines structured models for requests and responses, enhancing API documentation and type safety.
- Main Execution: Initializes the model and tokenizer, and starts the FastAPI app on the designated host and port.
Note:
This script doesn't include the setup for special tokens or multi-GPU support by default.
Users need to configure their special tokens and can enable multi-GPU support as per the provided instructions.
The embedding model only supports running on a single GPU.
"""
# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8000/docs for documents.
# In the OpenAI API, max_tokens is equivalent to HuggingFace's max_new_tokens, not max_length.
# For example, for a 6b model, setting max_tokens = 8192 raises an error, because after deducting the history and prompt tokens the model cannot output that many tokens.
import os
import time
import tiktoken
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Response
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from ipex_llm.transformers import AutoModel
from transformers import AutoTokenizer
from utils import process_response, generate_chatglm3, generate_stream_chatglm3
# from sentence_transformers import SentenceTransformer
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModel
# Set the SSE ping interval to keep long-running streaming requests alive
EventSourceResponse.DEFAULT_PING_INTERVAL = 1000
# set LLM path
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
# set Embedding Model path
EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', 'BAAI/bge-large-zh-v1.5')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
yield
if torch.cuda.is_available():
torch.cuda.empty_cache()
@@ -108,33 +79,6 @@ class DeltaMessage(BaseModel):
function_call: Optional[FunctionCallResponse] = None
## for Embedding
class EmbeddingRequest(BaseModel):
input: List[str]
model: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class EmbeddingResponse(BaseModel):
data: list
model: str
object: str
usage: CompletionUsage
# for ChatCompletionRequest
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
@@ -142,7 +86,8 @@ class ChatCompletionRequest(BaseModel):
top_p: Optional[float] = 0.8
max_tokens: Optional[int] = None
stream: Optional[bool] = False
tools: Optional[Union[dict, List[dict]]] = None
functions: Optional[Union[dict, List[dict]]] = None
# Additional parameters
repetition_penalty: Optional[float] = 1.1
@@ -153,68 +98,29 @@ class ChatCompletionResponseChoice(BaseModel):
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length", "function_call"]]
index: int
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
model: str
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
usage: Optional[UsageInfo] = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/v1/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
embeddings = [embedding_model.encode(text) for text in request.input]
embeddings = [embedding.tolist() for embedding in embeddings]
def num_tokens_from_string(string: str) -> int:
"""
Returns the number of tokens in a text string.
use cl100k_base tokenizer
"""
encoding = tiktoken.get_encoding('cl100k_base')
num_tokens = len(encoding.encode(string))
return num_tokens
response = {
"data": [
{
"object": "embedding",
"embedding": embedding,
"index": index
}
for index, embedding in enumerate(embeddings)
],
"model": request.model,
"object": "list",
"usage": CompletionUsage(
prompt_tokens=sum(len(text.split()) for text in request.input),
completion_tokens=0,
total_tokens=sum(num_tokens_from_string(text) for text in request.input),
)
}
return response
@app.get("/v1/models", response_model=ModelList)
async def list_models():
model_card = ModelCard(
id="chatglm3-6b"
)
return ModelList(
data=[model_card]
)
model_card = ModelCard(id="chatglm3-6b")
return ModelList(data=[model_card])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
@@ -232,74 +138,24 @@ async def create_chat_completion(request: ChatCompletionRequest):
echo=False,
stream=request.stream,
repetition_penalty=request.repetition_penalty,
tools=request.tools,
functions=request.functions,
)
logger.debug(f"==== request ====\n{gen_params}")
if request.stream:
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
# Use stream mode to read the first few characters; if it is not a function call, stream the output directly
predict_stream_generator = predict_stream(request.model, gen_params)
output = next(predict_stream_generator)
if not contains_custom_function(output):
return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
# Otherwise, obtain the full result at once and determine whether tools need to be called.
logger.debug(f"First result output:\n{output}")
function_call = None
if output and request.tools:
try:
function_call = process_response(output, use_tool=True)
except:
logger.warning("Failed to parse tool call")
# CallFunction
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
"""
In this demo, we did not register any tools.
You can use the tools implemented in our `tools_using_demo` and add your own streaming tool implementation here,
similar to the following method:
function_args = json.loads(function_call.arguments)
tool_response = dispatch_tool(tool_name: str, tool_params: dict)
"""
tool_response = ""
if not gen_params.get("messages"):
gen_params["messages"] = []
gen_params["messages"].append(ChatMessage(
role="assistant",
content=output,
))
gen_params["messages"].append(ChatMessage(
role="function",
name=function_call.name,
content=tool_response,
))
# Streaming output of results after function calls
generate = predict(request.model, gen_params)
return EventSourceResponse(generate, media_type="text/event-stream")
else:
# Fallback when no function call was parsed; stream the first output directly.
generate = parse_output_text(request.model, output)
return EventSourceResponse(generate, media_type="text/event-stream")
# Here is the handling of stream = False
response = generate_chatglm3(model, tokenizer, gen_params)
# Remove the first newline character
if response["text"].startswith("\n"):
response["text"] = response["text"][1:]
response["text"] = response["text"].strip()
usage = UsageInfo()
function_call, finish_reason = None, "stop"
if request.tools:
if request.functions:
try:
function_call = process_response(response["text"], use_tool=True)
except:
@@ -325,14 +181,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
task_usage = UsageInfo.model_validate(response["usage"])
for usage_key, usage_value in task_usage.model_dump().items():
setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
return ChatCompletionResponse(
model=request.model,
id="", # for open_source model, id is empty
choices=[choice_data],
object="chat.completion",
usage=usage
)
return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
async def predict(model_id: str, params: dict):
@@ -343,7 +192,7 @@ async def predict(model_id: str, params: dict):
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
@@ -361,8 +210,7 @@ async def predict(model_id: str, params: dict):
try:
function_call = process_response(decoded_unicode, use_tool=True)
except:
logger.warning(
"Failed to parse tool call, maybe the response is not a tool call or have been answered.")
logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
@@ -378,12 +226,7 @@ async def predict(model_id: str, params: dict):
delta=delta,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
@@ -391,141 +234,16 @@ async def predict(model_id: str, params: dict):
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def predict_stream(model_id, gen_params):
"""
Makes function calls compatible with stream-mode output.
The first several characters are inspected to decide whether the output is a function call.
If it is not a function call, the output is streamed directly.
Otherwise, the complete text of the function call is returned.
:param model_id:
:param gen_params:
:return:
"""
output = ""
is_function_call = False
has_send_first_chunk = False
for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(output):]
output = decoded_unicode
# While the output has not been identified as a function call and more than 7 characters have been generated,
# check for the special function prefix to decide whether this is a function call
if not is_function_call and len(output) > 7:
# Determine whether a function is called
is_function_call = contains_custom_function(output)
if is_function_call:
continue
# Non-function call, direct stream output
finish_reason = new_response["finish_reason"]
# Send an empty string first to avoid truncation by subsequent next() operations.
if not has_send_first_chunk:
message = DeltaMessage(
content="",
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
send_msg = delta_text if has_send_first_chunk else output
has_send_first_chunk = True
message = DeltaMessage(
content=send_msg,
role="assistant",
function_call=None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=message,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
created=int(time.time()),
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
if is_function_call:
yield output
else:
yield '[DONE]'
async def parse_output_text(model_id: str, value: str):
"""
Directly output the text content of value
:param model_id:
:param value:
:return:
"""
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
def contains_custom_function(value: str) -> bool:
"""
Determine whether the output is a 'function_call' based on a special function prefix.
For example, the functions defined in "tools_using_demo/tool_register.py" are all named "get_xxx" and start with "get_".
[Note] This is not a rigorous judgment method, only for reference.
:param value:
:return:
"""
return value and 'get_' in value
if __name__ == "__main__":
# Load LLM
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH,
load_in_4bit=True,
trust_remote_code=True)
# load Embedding
# embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
if 'cuda' in DEVICE: # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else: # CPU, Intel GPU and other devices fall back to Float32 precision (.float())
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
"""
This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model.
It includes functions to:
# Test the response with a curl command
# curl -X POST "http://127.0.0.1:8000/v1/chat/completions" \
# -H "Content-Type: application/json" \
# -d "{\"model\": \"chatglm3-6b\", \"messages\": [{\"role\": \"system\", \"content\": \"You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.\"}, {\"role\": \"user\", \"content\": \"你好,给我讲一个故事,大概100字\"}], \"stream\": false, \"max_tokens\": 100, \"temperature\": 0.8, \"top_p\": 0.8}"
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
# Test the response with Python code
import requests
import json
Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests
and handle responses.
"""
base_url = "http://127.0.0.1:8000"
from openai import OpenAI
base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
def create_chat_completion(model, messages, functions, use_stream=False):
data = {
"functions": functions,  # function definitions
"model": model,  # model name
"messages": messages,  # conversation history
"stream": use_stream,  # whether to stream the response
"max_tokens": 100,  # maximum number of tokens to generate
"temperature": 0.8,  # sampling temperature
"top_p": 0.8,  # top-p sampling probability
}
response = requests.post(f"{base_url}/v1/chat/completions", json=data, stream=use_stream)
if response.status_code == 200:
if use_stream:
# Handle streaming response
for line in response.iter_lines():
if line:
decoded_line = line.decode('utf-8')[6:]
try:
response_json = json.loads(decoded_line)
content = response_json.get("choices", [{}])[0].get("delta", {}).get("content", "")
print(content)
except:
print("Special Token:", decoded_line)
else:
# Handle non-streaming response
decoded_line = response.json()
content = decoded_line.get("choices", [{}])[0].get("message", "").get("content", "")
print(content)
else:
print("Error:", response.status_code)
return None
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
def function_chat(use_stream=True):
functions = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
"name": "get_current_weather",
"description": "Get the current weather in a given location.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. Beijing",
},
"required": ["location"],
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
chat_messages = [
{
"role": "user",
"content": "波士顿天气如何?",
},
{
"role": "assistant",
"content": "get_current_weather\n ```python\ntool_call(location='Beijing', unit='celsius')\n```",
"function_call": {
"name": "get_current_weather",
"arguments": '{"location": "Beijing", "unit": "celsius"}',
},
},
{
"role": "function",
"name": "get_current_weather",
"content": '{"temperature": "12", "unit": "celsius", "description": "Sunny"}',
},
# ... The following is the assistant's reply and the user's follow-up.
# {
# "role": "assistant",
# "content": "根据最新的天气预报,目前北京的天气情况是晴朗的,温度为12摄氏度。",
# },
# {
# "role": "user",
# "content": "谢谢",
# }
]
create_chat_completion("chatglm3-6b", messages=chat_messages, functions=functions, use_stream=use_stream)
def simple_chat(use_stream=True):
messages = [
functions = None
chat_messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's "
"instructions carefully. Respond using markdown.",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你用生动的话语给我讲一个故事"
"content": "你好,给我讲一个故事,大概100字"
}
]
response = client.chat.completions.create(
model="chatglm3-6b",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
presence_penalty=1.1,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["你好,给我讲一个故事,大概100字"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
create_chat_completion("chatglm3-6b", messages=chat_messages, functions=functions, use_stream=use_stream)
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
function_chat(use_stream=False)
# simple_chat(use_stream=True)
openai>=1.3.0
pydantic>=2.5.1
\ No newline at end of file
import os
import gc
import json
import torch
from torch.nn import Module
from transformers import PreTrainedModel, PreTrainedTokenizer
from transformers import AutoModel
from transformers.generation.logits_process import LogitsProcessor
from typing import Union, Tuple
from typing import Dict, Union, Optional, Tuple
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
# transformer.word_embeddings occupies 1 layer
# transformer.final_layernorm and lm_head occupy 1 layer
# transformer.layers occupies 28 layers
# 30 layers in total, distributed across num_gpus cards
num_trans_layers = 28
per_gpu_layers = 30 / num_gpus
# bugfix: on Linux, the weight and input passed to torch.embedding can end up on different devices, causing a RuntimeError
# on Windows, model.device is set to transformer.word_embeddings.device
# on Linux, model.device is set to lm_head.device
# when chat or stream_chat is called, input_ids are placed on model.device
# if transformer.word_embeddings.device and model.device differ, a RuntimeError is raised
# therefore transformer.word_embeddings, transformer.final_layernorm and lm_head are all placed on the first card
# this file comes from https://github.com/THUDM/ChatGLM-6B/blob/main/utils.py
# with only minor modifications to support ChatGLM3
device_map = {
'transformer.embedding.word_embeddings': 0,
'transformer.encoder.final_layernorm': 0,
'transformer.output_layer': 0,
'transformer.rotary_pos_emb': 0,
'lm_head': 0
}
used = 2
gpu_target = 0
for i in range(num_trans_layers):
if used >= per_gpu_layers:
gpu_target += 1
used = 0
assert gpu_target < num_gpus
device_map[f'transformer.encoder.layers.{i}'] = gpu_target
used += 1
return device_map
def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
if num_gpus < 2 and device_map is None:
model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
else:
from accelerate import dispatch_model
model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()
if device_map is None:
device_map = auto_configure_device_map(num_gpus)
model = dispatch_model(model, device_map=device_map)
return model
class InvalidScoreLogitsProcessor(LogitsProcessor):
@@ -46,13 +103,13 @@ def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
@torch.inference_mode()
def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
messages = params["messages"]
tools = params["tools"]
functions = params["functions"]
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
max_new_tokens = int(params.get("max_tokens", 256))
echo = params.get("echo", True)
messages = process_chatglm_messages(messages, tools=tools)
messages = process_chatglm_messages(messages, functions=functions)
query, role = messages[-1]["content"], messages[-1]["role"]
inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role)
@@ -65,7 +122,6 @@ def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokeni
eos_token_id = [
tokenizer.eos_token_id,
tokenizer.get_command("<|user|>"),
tokenizer.get_command("<|observation|>")
]
gen_kwargs = {
@@ -120,19 +176,17 @@ def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokeni
torch.cuda.empty_cache()
def process_chatglm_messages(messages, tools=None):
def process_chatglm_messages(messages, functions=None):
_messages = messages
messages = []
msg_has_sys = False
if tools:
if functions:
messages.append(
{
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
"tools": functions
}
)
msg_has_sys = True
for m in _messages:
role, content, func_call = m.role, m.content, m.function_call
@@ -155,9 +209,6 @@ def process_chatglm_messages(messages, tools=None):
}
)
else:
if role == "system" and msg_has_sys:
msg_has_sys = False
continue
messages.append({"role": role, "content": content})
return messages
"""
This script is an example of using the Zhipu API to create various interactions with a ChatGLM3 model. It includes
functions to:
1. Conduct a basic chat session, asking about weather conditions in multiple cities.
2. Initiate a simple chat in Chinese, asking the model to tell a short story.
3. Retrieve and print embeddings for a given text input.
Each function demonstrates a different aspect of the API's capabilities,
showcasing how to make requests and handle responses.
Note: Make sure your Zhipu API key is set as an environment variable
in the format xxx.xxx (it is only checked for format; a real key is not needed).
"""
from zhipuai import ZhipuAI
base_url = "http://127.0.0.1:8000/v1/"
client = ZhipuAI(api_key="EMP.TY", base_url=base_url)
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="chatglm3_6b",
messages=messages,
tools=tools,
tool_choice="auto",
)
if response:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=True):
messages = [
{
"role": "system",
"content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow "
"the user's instructions carefully. Respond using markdown.",
},
{
"role": "user",
"content": "你好,请你介绍一下chatglm3-6b这个模型"
}
]
response = client.chat.completions.create(
model="chatglm3_",
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.8,
top_p=0.8)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
else:
content = response.choices[0].message.content
print(content)
else:
print("Error:", response.status_code)
def embedding():
response = client.embeddings.create(
model="bge-large-zh-1.5",
input=["ChatGLM3-6B 是一个大型的中英双语模型。"],
)
embeddings = response.data[0].embedding
print("嵌入完成,维度:", len(embeddings))
if __name__ == "__main__":
simple_chat(use_stream=False)
simple_chat(use_stream=True)
embedding()
function_chat()
# basic requirements
protobuf
pydantic==1.10.9
transformers==4.30.2
sentencepiece==0.1.99
accelerate==0.21.0
sse-starlette
astunparse==1.6.2
protobuf>=4.25.3
transformers>=4.39.3
tokenizers>=0.15.0
cpm_kernels>=1.0.11
torch>=2.1.0
gradio>=4.26.0
sentencepiece>=0.2.0
sentence_transformers>=2.4.0
accelerate>=0.29.2
streamlit>=1.33.0
fastapi>=0.110.0
loguru~=0.7.2
mdtex2html>=1.3.0
latex2mathml>=3.77.0
jupyter_client>=8.6.1
nltk
# for openai demo
#openai>=1.17.1
#zhipuai>=2.0.1
#pydantic>=2.7.0
#sse-starlette>=2.0.0
#uvicorn>=0.29.0
#timm>=0.9.16
#tiktoken>=0.6.0
# for langchain demo
#langchain>=0.1.16
#langchainhub>=0.1.15
#arxiv>=2.1.0
# Deploying ChatGLM3 with NVIDIA TensorRT-LLM
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA. You can follow the steps below to deploy the ChatGLM3 model with TensorRT-LLM.
## 1. Install TensorRT-LLM
#### Get the TensorRT-LLM code:
```bash
# The TensorRT-LLM code must be pulled with git-lfs
apt-get update && apt-get -y install git git-lfs
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
# This walkthrough uses the v0.7.0 release
git checkout tags/v0.7.0 -b release/0.7.0
git submodule update --init --recursive
git lfs install
git lfs pull
```
#### Build the Docker image and install TensorRT-LLM:
```bash
make -C docker release_build
```
#### Run the Docker image:
```bash
make -C docker release_run
```
## 2. Build the TensorRT-LLM inference engine for ChatGLM3:
#### Install Python dependencies:
```bash
cd ./examples/chatglm
pip install -r requirements.txt
apt-get update
apt-get install git-lfs
```
#### Download the ChatGLM3 model from Hugging Face:
```
# Download only the model variant(s) you want to deploy
git clone https://huggingface.co/THUDM/chatglm3-6b chatglm3_6b
git clone https://huggingface.co/THUDM/chatglm3-6b-base chatglm3_6b_base
git clone https://huggingface.co/THUDM/chatglm3-6b-32k chatglm3_6b_32k
```
#### Build the inference engine with build.py:
Here are some examples of building inference engines with build.py:
```bash
# Build a default engine with fp16 precision
python3 build.py -m chatglm3_6b --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a default fp16 engine with FMHA enabled (see below)
python3 build.py -m chatglm3_6b --enable_context_fmha --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
# Build a w8a16 engine
python3 build.py -m chatglm3_6b --use_weight_only --output_dir trt_engines/chatglm3_6b/weight_only/1-gpu
# Build a default fp16 engine with support for two GPUs
python3 build.py -m chatglm3_6b --world_size 2 --output_dir trt_engines/chatglm3_6b/fp16/2-gpu
# Use the chatglm3_6b_base model
python3 build.py -m chatglm3_6b_base --output_dir trt_engines/chatglm3_6b_base/fp16/1-gpu
# Use the chatglm3_6b_32k model
python3 build.py -m chatglm3_6b_32k --output_dir trt_engines/chatglm3_6b-32k/fp16/1-gpu
```
#### Configurable plugin parameters
* Use `--use_gpt_attention_plugin <DataType>` to configure the GPT Attention plugin (float16 by default).
* Use `--use_gemm_plugin <DataType>` to configure the GEMM plugin (float16 by default).
* Use `--use_rmsnorm_plugin <DataType>` to configure the RMS normalization plugin (float16 by default). A combined example is sketched below.
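For illustration, a build command that explicitly sets all three plugins to float16 (the stated defaults) might look like the following sketch; the output directory name is just an example:

```bash
# Illustrative only: explicitly select float16 for the three plugins described above
python3 build.py -m chatglm3_6b \
    --use_gpt_attention_plugin float16 \
    --use_gemm_plugin float16 \
    --use_rmsnorm_plugin float16 \
    --output_dir trt_engines/chatglm3_6b/fp16/1-gpu
```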
#### Fused Multi-Head Attention (FMHA)
* Use the `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` flag to enable the FMHA kernels, which gives better performance while reducing GPU memory usage.
* FMHA cannot be used if `--use_gpt_attention_plugin` is disabled.
* `--enable_context_fmha` uses an FP16 accumulator, which may slightly reduce accuracy. You can instead use `--enable_context_fmha_fp32_acc` to preserve accuracy, at the cost of slightly less of the FMHA performance gain (see the example below).
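A sketch of the FP32-accumulator variant, assuming the same single-GPU fp16 build as above (the output directory name is hypothetical):

```bash
# Illustrative: FMHA with FP32 accumulation to preserve accuracy
python3 build.py -m chatglm3_6b \
    --enable_context_fmha_fp32_acc \
    --output_dir trt_engines/chatglm3_6b/fp16_fmha_fp32acc/1-gpu
```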
#### Weight-Only quantization
* Use `--use_weight_only` to enable weight-only quantization, which speeds up inference and reduces GPU memory usage.
* You can also switch between `--weight_only_precision int8` and `--weight_only_precision int4` to choose int8 or int4 quantization; the default is int8 (see the example below).
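For example, an int4 weight-only build might look like the following sketch (the output directory name is hypothetical):

```bash
# Illustrative: weight-only int4 quantization
python3 build.py -m chatglm3_6b \
    --use_weight_only \
    --weight_only_precision int4 \
    --output_dir trt_engines/chatglm3_6b/weight_only_int4/1-gpu
```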
#### In-flight Batching (requires NVIDIA Triton for inference)
* Use `--use_inflight_batching` to enable in-flight batching; when it is enabled, Paged KV Cache is also enabled automatically.
* The paged KV cache can be tuned with `--tokens_per_block`, which sets the number of tokens per block (see the example below).
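As a sketch, a build intended for in-flight batching served through Triton might combine these flags as follows (the `--tokens_per_block` value and output directory are just example choices):

```bash
# Illustrative: engine built with in-flight batching and paged KV cache
python3 build.py -m chatglm3_6b \
    --use_gpt_attention_plugin float16 \
    --use_inflight_batching \
    --tokens_per_block 64 \
    --output_dir trt_engines/chatglm3_6b/ifb/1-gpu
```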
For more detailed features and configuration options, see the [TensorRT-LLM ChatGLM implementation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/chatglm).
## 3. Run inference with the TensorRT-LLM Python runtime
#### Single-node, single-GPU inference example:
```bash
python3 ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Single-node, multi-GPU inference example:
```bash
mpirun -n 2 \
python ../run.py --input_text "What's new between ChatGLM3-6B and ChatGLM2-6B?" \
--max_output_len 50 \
--tokenizer_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
* If you run `mpirun` as root, you may need to add the `--allow-run-as-root` flag.
#### Run summarize.py for an article summarization task:
```bash
python3 ../summarize.py --test_trt_llm \
--hf_model_dir chatglm3_6b \
--engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
#### Run the provided chat demo, [tensorrt_llm_cli_demo.py](tensorrt_llm_cli_demo.py):
```bash
python3 tensorrt_llm_cli_demo.py --tokenizer_dir chatglm3_6b --engine_dir trt_engines/chatglm3_6b/fp16/1-gpu
```
Sample output:
```
用户: what is your name?
ChatGLM3-6B:Hello, I am an assistant named ChatGLM3-6B, and you can call me assistant. What can I help you with??
用户: what is new in ChatGLM3-6B compared with ChatGLM2-6B?
ChatGLM3-6B:ChatGLM3-6B is an improved version of ChatGLM2-6B. Compared with ChatGLM2-6B, ChatGLM3-6B has the following improvements:
1. Enhanced language understanding capabilities: ChatGLM3-6B's language model is based on the GLM3-6B model, which has been pre-trained on more diverse and large-scale data, resulting in better language understanding and generation capabilities.
2. Improved generation ability: ChatGLM3-6B has improved the generation ability compared to ChatGLM2-6B. With more training data and optimization algorithms, ChatGLM3-6B can generate more coherent and natural-looking text.
3. Enhanced adaptability to different dialogue scenarios: ChatGLM3-6B has been trained on more diverse dialogue data, including dialogue scenarios with different languages, cultures, and styles, making it more adaptable to different dialogue scenarios.
4. New features and functions: ChatGLM3-6B also has some new features and functions, such as support for multiple choice questions, sentiment analysis, and entity recognition.
In short, ChatGLM3-6B is more advanced and capable than ChatGLM2-6B, and can better meet the needs of users in various scenarios..
```
#### Performance testing:
Instructions for benchmarking ChatGLM3 running on TensorRT-LLM are available [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/benchmarks/python).
## 4. Deploy an online inference server with NVIDIA Triton
NVIDIA Triton lets you deploy a high-performance, scalable, and stable inference service, and you can enable in-flight batching to improve serving throughput. See the [In-flight Batching Triton Backend](https://github.com/triton-inference-server/tensorrtllm_backend/tree/main/inflight_batcher_llm) for details.
\ No newline at end of file
"""
This script is a part of a larger project for generating text using large language models.
It includes functionalities for finding engine files, parsing arguments, setting up configurations for different models,
and executing the generation process with various settings.
This script particularly supports models like ChatGLM3-6B and its variants,
handling quantization, serialization, and runtime aspects.
Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Modifications made by Yuxuan.Zhang @ ZhipuAI on 2023-12-24.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Modifications:
1. Removed input_file, tokenizer_type, and other parameters unrelated to dialogue. Set num_beams to 1.
2. Adapted single turn dialogue into ChatGLM3-6B template and implemented multi-turn conversations.
"""
import argparse
import json
import torch
import transformers
from pathlib import Path
from typing import List
import tensorrt_llm
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (GenerationSession, ModelConfig, SamplingConfig)
def find_engines(dir: Path, model_name: str = "*", dtype: str = "*", tp_size: str = "*", rank: str = "*") -> List[Path]:
"""
Searches for engine files matching a specified pattern within a directory.
This is typically used to locate compiled model files for efficient execution on specific hardware.
Parameters:
- dir: The directory to search.
- model_name, dtype, tp_size, rank:
Pattern matching parameters to filter engine files by model name, data type,
tensor parallel size, and rank respectively.
Returns:
- A list of Paths pointing to the engine files.
"""
template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine"
return list(dir.glob(template))
def parse_arguments(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('--model_name',
type=str,
choices=[
"chatglm3_6b",
"chatglm3_6b_base",
"chatglm3_6b_32k"
],
default="chatglm3_6b",
help='the name of the model')
parser.add_argument('--max_output_len', type=int, default=4096)
parser.add_argument('--engine_dir', type=str, default=None)
parser.add_argument('--tokenizer_dir', type=str, default=None)
parser.add_argument('--temperature', type=float, default=0.95)
parser.add_argument('--top_k', type=int, default=1)
parser.add_argument('--top_p', type=float, default=0.8)
parser.add_argument('--random_seed', type=int, default=2023)
parser.add_argument('--streaming', default=True, action='store_true')
args = parser.parse_args(args)
return args
def main():
"""
The main execution function of the script. It orchestrates the text generation process
by performing several key steps:
- Parses command-line arguments to configure model details, output specifications,
and other user-defined parameters.
- Loads the model configuration from a specified directory and prepares the environment for text generation
based on the model and hardware specifics.
- Sets up the generation session with the appropriate model, tokenizer, and runtime configurations.
- Enters a loop to continuously accept user input, generate text based on the provided prompts, and output
the model's responses.
- Handles special commands such as 'stop' to end the conversation and 'clear' to reset the chat history.
- Manages resources and ensures that the generated text is properly formatted and presented to the user.
The function is designed to be the entry point of the script, invoking all necessary components and managing the
flow of data and control throughout the execution.
"""
args = parse_arguments()
config_path = Path(args.engine_dir) / 'config.json'
with open(config_path, 'r') as f:
config = json.load(f)
dtype = config['builder_config']['precision']
max_output_len = min(config['builder_config']['max_output_len'], args.max_output_len)
use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin']
remove_input_padding = config['builder_config']['remove_input_padding']
tp_size = config['builder_config']['tensor_parallel']
pp_size = config['builder_config']['pipeline_parallel']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
max_output_len = min(max_output_len, args.max_output_len)
runtime_rank = tensorrt_llm.mpi_rank()
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank, tp_size=world_size)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
serialize_path = find_engines(
dir=Path(args.engine_dir),
model_name=args.model_name,
dtype=dtype,
tp_size=world_size,
rank=runtime_rank)[0]
tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_dir, trust_remote_code=True)
model_config = ModelConfig(vocab_size=config['builder_config']['vocab_size'],
num_layers=config['builder_config']['num_layers'],
num_heads=config['builder_config']['num_heads'] // tp_size,
num_kv_heads=(config['builder_config']['num_kv_heads'] + tp_size - 1) // tp_size,
hidden_size=config['builder_config']['hidden_size'] // tp_size,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=config['builder_config']['remove_input_padding'],
model_name=args.model_name,
paged_kv_cache=config['builder_config']['paged_kv_cache'],
quant_mode=QuantMode(config['builder_config']['quant_mode']),
dtype=dtype)
sampling_config = SamplingConfig(
end_id=tokenizer.eos_token_id,
pad_id=tokenizer.pad_token_id,
num_beams=1,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p
)
sampling_config.random_seed = args.random_seed
with open(serialize_path, 'rb') as f:
engine_buffer = f.read()
decoder = GenerationSession(model_config, engine_buffer, runtime_mapping)
history = []
while True:
input_text_with_history = ""
max_input_len = config['builder_config']['max_input_len']
input_text = input("用户: ")
if input_text.lower() == 'stop':
break
if input_text.lower() == 'clear':
history = []
print("ChatGLM3-6B: 对话历史已清空")
continue
history.append(input_text)
for idx, content in enumerate(history):
if idx % 2 != 0:
input_text_with_history += "{}\n".format(content)
else:
input_text_with_history += "<|user|>{}\n<|assistant|>".format(content)
tokenized = tokenizer(
input_text_with_history,
return_tensors="pt",
padding=True,
return_length=True
)
input_ids = tokenized['input_ids'].int()
input_lengths = tokenized['length'].int()
max_input_len_real = torch.max(input_lengths)
if max_input_len_real > max_input_len:
input_ids = input_ids[:, :max_input_len]
input_lengths = torch.where(input_lengths > max_input_len, max_input_len, input_lengths)
else:
max_input_len = max_input_len_real
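# Layout note: when remove_input_padding is enabled, the whole batch is packed into a single
# row containing only real tokens (per-sequence lengths are tracked separately); otherwise,
# with the GPT attention plugin, leading pad tokens are skipped and each sequence is copied
# left-aligned into a buffer pre-filled with the end-of-sequence id.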
if remove_input_padding:
input_ids_no_padding = (torch.zeros(1, torch.sum(input_lengths), dtype=torch.int32))
lengths_acc = torch.cumsum(torch.cat([torch.IntTensor([0]), input_lengths]), dim=0)
for i in range(len(input_ids)):
input_ids_no_padding[0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor(
input_ids[i, max_input_len - input_lengths[i]:max_input_len])
input_ids = input_ids_no_padding
elif use_gpt_attention_plugin:
input_ids_padding_right = torch.zeros_like(input_ids) + sampling_config.end_id
for i, sample in enumerate(input_ids):
nPadding = 0
for token in sample:
if token == sampling_config.pad_id:
nPadding += 1
else:
break
input_ids_padding_right[i, :len(sample[nPadding:])] = sample[nPadding:]
input_ids = input_ids_padding_right
input_lengths = torch.tensor([input_ids.shape[-1]], dtype=torch.int32)
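# The four positional arguments below are batch size, max input (context) length, max new
# tokens and beam width, per the TensorRT-LLM GenerationSession API this demo targets.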
decoder.setup(1, max_input_len, max_output_len, 1)
output = decoder.decode(
input_ids.contiguous().cuda(),
input_lengths.contiguous().cuda(),
sampling_config,
output_sequence_lengths=True,
return_dict=True,
streaming=args.streaming
)
print("ChatGLM3-6B:", end="")
generated_text = ""
if args.streaming:
for output_item in output:
output_id = output_item["output_ids"]
output_sequence_lengths = output_item["sequence_lengths"]
output_id = output_id[0, 0, output_sequence_lengths[0, 0] - 1]
output_word = tokenizer.convert_ids_to_tokens(int(output_id))
output_word = output_word.replace("▁", " ")
output_word = tokenizer.convert_tokens_to_string(output_word)
print(output_word, end="", flush=True)
generated_text += output_word
print("\n")
else:
torch.cuda.synchronize()
output_ids = output["output_ids"][0]
output = output_ids[0, input_lengths.item():]
generated_text = tokenizer.decode(output, skip_special_tokens=True)
print(generated_text)
history.append(generated_text)
del decoder
print(f"Good bye!")
if __name__ == '__main__':
main()
"""
This demo script is designed for interacting with ChatGLM3-6B to demonstrate its Function Call (tool-calling) capabilities.
"""
import os
import platform
import torch
from transformers import AutoTokenizer, AutoModel
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE:  # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPU and other devices fall back to full precision (float32)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
os_name = platform.system()
clear_command = 'cls' if os_name == 'Windows' else 'clear'
@@ -26,79 +26,10 @@ def build_prompt(history):
return prompt
tools = [
{'name': 'track', 'description': '追踪指定股票的实时价格',
'parameters':
{
'type': 'object', 'properties':
{'symbol':
{
'description': '需要追踪的股票代码'
}
},
'required': []
}
}, {
'name': '/text-to-speech', 'description': '将文本转换为语音',
'parameters':
{
'type': 'object', 'properties':
{
'text':
{
'description': '需要转换成语音的文本'
},
'voice':
{
'description': '要使用的语音类型(男声、女声等)'
},
'speed': {
'description': '语音的速度(快、中等、慢等)'
}
}, 'required': []
}
},
{
'name': '/image_resizer', 'description': '调整图片的大小和尺寸',
'parameters': {'type': 'object',
'properties':
{
'image_file':
{
'description': '需要调整大小的图片文件'
},
'width':
{
'description': '需要调整的宽度值'
},
'height':
{
'description': '需要调整的高度值'
}
},
'required': []
}
},
{
'name': '/foodimg', 'description': '通过给定的食品名称生成该食品的图片',
'parameters': {
'type': 'object', 'properties':
{
'food_name':
{
'description': '需要生成图片的食品名称'
}
},
'required': []
}
}
]
system_item = {
"role": "system",
"content": "Answer the following questions as best as you can. You have access to the following tools:",
"tools": tools
}
def main():
past_key_values, history = None, [system_item]
@@ -110,7 +41,7 @@ def main():
if query.strip() == "stop":
break
if query.strip() == "clear":
past_key_values, history = None, [system_item]
role = "user"
os.system(clear_command)
print("欢迎使用 ChatGLM3-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
@@ -121,9 +52,7 @@ def main():
print("")
if isinstance(response, dict):
role = "observation"
else:
role = "user"
if __name__ == "__main__":
main()
\ No newline at end of file
@@ -12,13 +12,13 @@ client = OpenAI(
api_key = "xxx"
)
tools = get_tools()
functions = get_tools()
def run_conversation(query: str, stream=False, tools=None, max_retry=5):
def run_conversation(query: str, stream=False, functions=None, max_retry=5):
params = dict(model="chatglm3", messages=[{"role": "user", "content": query}], stream=stream)
if tools:
params["tools"] = tools
if functions:
params["functions"] = functions
response = client.chat.completions.create(**params)
for _ in range(max_retry):
@@ -74,7 +74,7 @@ def run_conversation(query: str, stream=False, tools=None, max_retry=5):
{
"role": "function",
"name": function_call.name,
"content": tool_response,
"content": tool_response, # 调用函数返回结果
}
)
@@ -90,4 +90,4 @@ if __name__ == "__main__":
logger.info("\n=========== next conversation ===========")
query = "帮我查询北京的天气怎么样"
run_conversation(query, tools=tools, stream=True)
run_conversation(query, functions=functions, stream=True)
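# Minimal sketch of what the legacy "functions"-style request amounts to (the server is the
# OpenAI-compatible ChatGLM3 endpoint configured above; get_tools() and the weather query are
# the demo's own):
#   response = client.chat.completions.create(
#       model="chatglm3",
#       messages=[{"role": "user", "content": "帮我查询北京的天气怎么样"}],
#       functions=get_tools(),
#   )
#   # a tool call, if any, is expected on response.choices[0].message.function_call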
tools = [
{
"name": "track",
"description": "追踪指定股票的实时价格",
"parameters": {
"type": "object",
"properties": {
"symbol": {
"description": "需要追踪的股票代码"
}
},
"required": ['symbol']
}
},
{
"name": "text-to-speech",
"description": "将文本转换为语音",
"parameters": {
"type": "object",
"properties": {
"text": {
"description": "需要转换成语音的文本"
},
"voice": {
"description": "要使用的语音类型(男声、女声等)"
},
"speed": {
"description": "语音的速度(快、中等、慢等)"
}
},
"required": ['text']
}
}
]
system_info = {"role": "system", "content": "Answer the following questions as best as you can. You have access to the following tools:", "tools": tools}
import os
import platform
from transformers import AutoTokenizer, AutoModel
import torch
MODEL_PATH = os.environ.get('MODEL_PATH', '../../chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
if 'cuda' in DEVICE: # AMD, NVIDIA GPU can use Half Precision
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).to(DEVICE).eval()
else:  # CPU, Intel GPU and other devices fall back to full precision (float32)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
history = [system_info]
query = "帮我查询股票10111的价格"
response, history = model.chat(tokenizer, query, history=history)
print(response)
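# For a query like this, a tool-invoking reply is a dict rather than plain text, e.g.
# {'name': 'track', 'parameters': {'symbol': '10111'}} (illustrative values); the caller is
# expected to run the tool and feed its output back to model.chat with role="observation".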
\ No newline at end of file
"""
This module implements tool registration: by registering tools, the model is able to make tool (function) calls.
"""
import inspect
import traceback
from copy import deepcopy
@@ -40,7 +37,7 @@ def register_tool(func: callable):
tool_def = {
"name": tool_name,
"description": tool_description,
"parameters": tool_params
"params": tool_params
}
print("[registered tool] " + pformat(tool_def))
@@ -65,7 +62,7 @@ def get_tools() -> dict:
return deepcopy(_TOOL_DESCRIPTIONS)
# Tool Definitions
@register_tool
def random_number_generator(
......
#!/bin/bash
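# Upgrade pip, then install/upgrade every package listed in requirements.txt one at a time
# via the Tsinghua PyPI mirror.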
python -m pip install --upgrade pip -i https://pypi.tuna.tsinghua.edu.cn/simple
while read -r requirement; do
python -m pip install --upgrade "$requirement" -i https://pypi.tuna.tsinghua.edu.cn/simple
done < requirements.txt