Commit fbb7398d authored by chenzk

v1.0

Copyright (C) 2024 AIDC-AI
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This dataset was generated based on the following third-party model:
We used data generated by Qwen2.5-7B-Instruct (https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), licensed under the Apache License, Version 2.0 (https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/LICENSE, SPDX-License-Identifier: Apache-2.0).
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
ENV DEBIAN_FRONTEND=noninteractive
# RUN apt-get update && apt-get install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.3/env.sh
# Install pip dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
torch
transformers
# vllm==0.6.2
tqdm
# Deploy Marco-o1 API with FastAPI
This example provides a FastAPI-based API for interacting with the Marco-o1 language model. You can choose between streaming and non-streaming responses, depending on your use case.
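Both modes expose the same `POST /chat/` endpoint and accept the same JSON body. A minimal sketch of that body, with field names taken from the bundled client scripts (the example message is illustrative):
```python
# Request body sent by both clients to POST http://localhost:8000/chat/
# (field names follow client.py and stream_client.py below).
payload = {
    "user_input": "Hello, Marco-o1!",  # current user message (illustrative)
    "history": [],                     # prior turns as [{"role": ..., "content": ...}]
}
# The non-streaming server replies with JSON: {"response": ..., "history": ...};
# the streaming server replies with a plain-text token stream.
```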
## Requirements
- FastAPI
- Uvicorn
- Transformers
- Torch
- VLLM
- HTTPX (for the streaming client)
- Requests (optional, for the non-streaming client)
## Running the API Server
### Non-Streaming Mode
To start the FastAPI server with non-streaming responses:
```bash
uvicorn vllm_fastapi:app --workers 1
```
To run a client with non-streaming responses:
```bash
python3 client.py
```
### Streaming Mode
To start the FastAPI server with streaming responses:
```bash
uvicorn stream_vllm_fastapi:app --workers 1
```
To run a client with streaming responses:
```bash
python3 stream_client.py
```
import requests


def chat_with_model(user_input, history):
    url = "http://localhost:8000/chat/"
    payload = {
        "user_input": user_input,
        "history": history
    }
    headers = {
        "Content-Type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to send request: {response.status_code} {response.text}")
        return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue
        result = chat_with_model(user_input, history)
        if result:
            # Display the response from the model.
            print(f"Assistant: {result['response']}")
            # Update the chat history from the response.
            history = result['history']


if __name__ == "__main__":
    main()
import httpx


def chat_with_model_stream(user_input, history, url="http://localhost:8000/chat/"):
    payload = {
        "user_input": user_input,
        "history": history
    }
    headers = {
        "Content-Type": "application/json"
    }
    # Open a streaming POST request with httpx and print chunks as they arrive.
    with httpx.Client() as client:
        with client.stream("POST", url, json=payload, headers=headers) as response:
            if response.status_code == 200:
                print("Assistant:", end=" ")
                for chunk in response.iter_text():
                    print(chunk, end="", flush=True)
                print()
            else:
                # Read the body first so .text is available on a streamed response.
                response.read()
                print(f"Failed to send request: {response.status_code} {response.text}")
                return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue
        chat_with_model_stream(user_input, history)
        # Future improvement: update history based on the API response if needed.


if __name__ == "__main__":
    main()
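The streaming client above never updates `history` (see the trailing comment in `main`). One way to close that gap, assuming the server keeps streaming plain text as in `stream_vllm_fastapi` below, is to accumulate the chunks client-side and append both turns locally. A minimal sketch using a hypothetical helper, not part of the original scripts:
```python
# Hypothetical helper: accumulate the streamed reply and update history locally.
# Assumes the /chat/ endpoint streams plain text, as in stream_vllm_fastapi.
import httpx


def chat_and_update_history(user_input, history, url="http://localhost:8000/chat/"):
    payload = {"user_input": user_input, "history": history}
    assistant_reply = ""
    with httpx.Client(timeout=None) as client:
        with client.stream("POST", url, json=payload) as response:
            response.raise_for_status()
            for chunk in response.iter_text():
                print(chunk, end="", flush=True)
                assistant_reply += chunk
    print()
    # Mirror the server-side message format: {"role": ..., "content": ...}.
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": assistant_reply})
    return history
```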
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

app = FastAPI()


class ChatRequest(BaseModel):
    user_input: str
    history: list


tokenizer = None
model = None


@app.on_event("startup")
def load_model_and_tokenizer():
    global tokenizer, model
    path = "AIDC-AI/Marco-o1"
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = LLM(model=path, tensor_parallel_size=4)


def generate_response_stream(model, text, max_new_tokens=4096):
    new_output = ''
    sampling_params = SamplingParams(
        max_tokens=1,
        temperature=0,
        top_p=0.9
    )
    with torch.inference_mode():
        # Re-run generation on the growing prefix, one new token per step,
        # so each token can be streamed to the client as soon as it is produced.
        for _ in range(max_new_tokens):
            outputs = model.generate(
                [f'{text}{new_output}'],
                sampling_params=sampling_params,
                use_tqdm=False
            )
            next_token = outputs[0].outputs[0].text
            new_output += next_token
            yield next_token  # Yield each part of the response.
            if new_output.endswith('</Output>'):
                break


@app.post("/chat/")
async def chat(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")
    if request.user_input.lower() in ['q', 'quit']:
        return {"response": "Exiting chat."}
    if request.user_input.lower() == 'c':
        request.history.clear()
        return {"response": "Clearing chat history."}
    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(request.history, tokenize=False, add_generation_prompt=True)
    response_stream = generate_response_stream(model, text)
    # Stream the response using StreamingResponse.
    return StreamingResponse(response_stream, media_type="text/plain")
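The file above is the streaming server (`stream_vllm_fastapi`). The non-streaming variant referenced in the README (`vllm_fastapi`) is not shown in this commit; the sketch below shows what its `/chat/` endpoint could look like, generating the full reply in one vLLM call and returning the JSON shape that `client.py` expects. This is an assumption based on the client contract, not the original file.
```python
# Hypothetical sketch of the non-streaming endpoint (vllm_fastapi.py is not
# shown in this commit). Reuses the same app, tokenizer, and model globals.
@app.post("/chat/")
async def chat_non_streaming(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")
    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(
        request.history, tokenize=False, add_generation_prompt=True
    )
    # Generate the whole reply in a single call instead of token by token.
    sampling_params = SamplingParams(max_tokens=4096, temperature=0, top_p=0.9)
    outputs = model.generate([text], sampling_params=sampling_params, use_tqdm=False)
    reply = outputs[0].outputs[0].text
    request.history.append({"role": "assistant", "content": reply})
    return {"response": reply, "history": request.history}
```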