Commit fbb7398d authored by chenzk

v1.0

Copyright (C) 2024 AIDC-AI
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This dataset was generated based on the following third-party model:
We used data generated by Qwen2.5-7B-Instruct (https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), licensed under the Apache License, Version 2.0 (https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/LICENSE, SPDX-License-Identifier: Apache-2.0).
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04
ENV DEBIAN_FRONTEND=noninteractive
# RUN apt-get update && apt-get install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.3/env.sh
# Install pip dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
torch
transformers
# vllm==0.6.2
tqdm
# Deploy Marco-o1 API with FastAPI
This example provides a FastAPI-based API for interacting with the Marco-o1 language model. You can choose between streaming and non-streaming responses, depending on your use case.
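Both modes expose the same `POST /chat/` endpoint and accept the same JSON body. A minimal sketch of that body, with field names taken from the bundled client scripts (the example message is illustrative):
```python
# Request body sent by both clients to POST http://localhost:8000/chat/
# (field names follow client.py and stream_client.py below).
payload = {
    "user_input": "Hello, Marco-o1!",  # current user message (illustrative)
    "history": [],                     # prior turns as [{"role": ..., "content": ...}]
}
# The non-streaming server replies with JSON: {"response": ..., "history": ...};
# the streaming server replies with a plain-text token stream.
```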
## Requirements
- FastAPI
- Uvicorn
- Transformers
- Torch
- VLLM
- HTTPX (for the streaming client)
- Requests (optional, for the non-streaming client)
## Running the API Server
### Non-Streaming Mode
To start the FastAPI server with non-streaming responses:
```bash
uvicorn vllm_fastapi:app --workers 1
```
To run a client with non-streaming responses:
```bash
python3 client.py
```
### Streaming Mode
To start the FastAPI server with streaming responses:
```bash
uvicorn stream_vllm_fastapi:app --workers 1
```
To run a client with streaming responses:
```bash
python3 stream_client.py
```
import requests


def chat_with_model(user_input, history):
    url = "http://localhost:8000/chat/"
    payload = {
        "user_input": user_input,
        "history": history
    }
    headers = {
        "Content-Type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to send request: {response.status_code} {response.text}")
        return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue
        result = chat_with_model(user_input, history)
        if result:
            # Display the response from the model.
            print(f"Assistant: {result['response']}")
            # Update the chat history from the response.
            history = result['history']


if __name__ == "__main__":
    main()
import httpx


def chat_with_model_stream(user_input, history, url="http://localhost:8000/chat/"):
    payload = {
        "user_input": user_input,
        "history": history
    }
    headers = {
        "Content-Type": "application/json"
    }
    # Open a streaming POST request with httpx and print chunks as they arrive.
    with httpx.Client() as client:
        with client.stream("POST", url, json=payload, headers=headers) as response:
            if response.status_code == 200:
                print("Assistant:", end=" ")
                for chunk in response.iter_text():
                    print(chunk, end="", flush=True)
                print()
            else:
                # Read the body first so .text is available on a streamed response.
                response.read()
                print(f"Failed to send request: {response.status_code} {response.text}")
                return None


def main():
    history = []
    print("Enter 'q' to quit, 'c' to clear chat history.")
    while True:
        user_input = input("User: ").strip()
        if user_input.lower() in ['q', 'quit']:
            print("Exiting chat.")
            break
        if user_input.lower() == 'c':
            print("Clearing chat history.")
            history.clear()
            continue
        chat_with_model_stream(user_input, history)
        # Future improvement: update history based on the API response if needed.


if __name__ == "__main__":
    main()
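The streaming client above never updates `history` (see the trailing comment in `main`). One way to close that gap, assuming the server keeps streaming plain text as in `stream_vllm_fastapi` below, is to accumulate the chunks client-side and append both turns locally. A minimal sketch using a hypothetical helper, not part of the original scripts:
```python
# Hypothetical helper: accumulate the streamed reply and update history locally.
# Assumes the /chat/ endpoint streams plain text, as in stream_vllm_fastapi.
import httpx


def chat_and_update_history(user_input, history, url="http://localhost:8000/chat/"):
    payload = {"user_input": user_input, "history": history}
    assistant_reply = ""
    with httpx.Client(timeout=None) as client:
        with client.stream("POST", url, json=payload) as response:
            response.raise_for_status()
            for chunk in response.iter_text():
                print(chunk, end="", flush=True)
                assistant_reply += chunk
    print()
    # Mirror the server-side message format: {"role": ..., "content": ...}.
    history.append({"role": "user", "content": user_input})
    history.append({"role": "assistant", "content": assistant_reply})
    return history
```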
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

app = FastAPI()


class ChatRequest(BaseModel):
    user_input: str
    history: list


tokenizer = None
model = None


@app.on_event("startup")
def load_model_and_tokenizer():
    global tokenizer, model
    path = "AIDC-AI/Marco-o1"
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = LLM(model=path, tensor_parallel_size=4)


def generate_response_stream(model, text, max_new_tokens=4096):
    new_output = ''
    sampling_params = SamplingParams(
        max_tokens=1,
        temperature=0,
        top_p=0.9
    )
    with torch.inference_mode():
        # Re-run generation on the growing prefix, one new token per step,
        # so each token can be streamed to the client as soon as it is produced.
        for _ in range(max_new_tokens):
            outputs = model.generate(
                [f'{text}{new_output}'],
                sampling_params=sampling_params,
                use_tqdm=False
            )
            next_token = outputs[0].outputs[0].text
            new_output += next_token
            yield next_token  # Yield each part of the response.
            if new_output.endswith('</Output>'):
                break


@app.post("/chat/")
async def chat(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")
    if request.user_input.lower() in ['q', 'quit']:
        return {"response": "Exiting chat."}
    if request.user_input.lower() == 'c':
        request.history.clear()
        return {"response": "Clearing chat history."}
    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(request.history, tokenize=False, add_generation_prompt=True)
    response_stream = generate_response_stream(model, text)
    # Stream the response using StreamingResponse.
    return StreamingResponse(response_stream, media_type="text/plain")
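The file above is the streaming server (`stream_vllm_fastapi`). The non-streaming variant referenced in the README (`vllm_fastapi`) is not shown in this commit; the sketch below shows what its `/chat/` endpoint could look like, generating the full reply in one vLLM call and returning the JSON shape that `client.py` expects. This is an assumption based on the client contract, not the original file.
```python
# Hypothetical sketch of the non-streaming endpoint (vllm_fastapi.py is not
# shown in this commit). Reuses the same app, tokenizer, and model globals.
@app.post("/chat/")
async def chat_non_streaming(request: ChatRequest):
    if not request.user_input:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")
    request.history.append({"role": "user", "content": request.user_input})
    text = tokenizer.apply_chat_template(
        request.history, tokenize=False, add_generation_prompt=True
    )
    # Generate the whole reply in a single call instead of token by token.
    sampling_params = SamplingParams(max_tokens=4096, temperature=0, top_p=0.9)
    outputs = model.generate([text], sampling_params=sampling_params, use_tqdm=False)
    reply = outputs[0].outputs[0].text
    request.history.append({"role": "assistant", "content": reply})
    return {"response": reply, "history": request.history}
```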