v1.0

fbb7398d · chenzk · fbb7398d · fbb7398d · fbb7398d · fbb7398d
Commit fbb7398d authored Jan 17, 2025 by chenzk
7 changed files
--- a/examples/vllm_fastapi.py
+++ b/examples/vllm_fastapi.py
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import torch
+from vllm import LLM, SamplingParams
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# ASGI application object; the startup hook and the /chat/ route below register on it.
+app = FastAPI()
+
+
+# Request body accepted by the /chat/ endpoint.
+class ChatRequest(BaseModel):
+    # Latest message (or command) sent by the client.
+    user_input: str
+    # Conversation so far. The endpoint appends {"role": ..., "content": ...}
+    # dicts to it and passes it to tokenizer.apply_chat_template.
+    history: list
+
+
+# Module-level handles assigned by the startup hook; both stay None until it has run.
+tokenizer = None
+model = None
+
+
+@app.on_event("startup")
+def load_model_and_tokenizer():
+    global tokenizer, model
+    path = "AIDC-AI/Marco-o1"
+    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+    model = LLM(model=path, tensor_parallel_size=4)
+
+
+def generate_response(model, text, max_new_tokens=4096):
+    new_output = ''
+    sampling_params = SamplingParams(
+        max_tokens=1,
+        temperature=0,
+        top_p=0.9
+    )
+    with torch.inference_mode():
+        for _ in range(max_new_tokens):
+            outputs = model.generate(
+                [f'{text}{new_output}'],
+                sampling_params=sampling_params,
+                use_tqdm=False
+            )
+            new_output += outputs[0].outputs[0].text
+            if new_output.endswith('</Output>'):
+                break
+    return new_output
+
+
+@app.post("/chat/")
+async def chat(request: ChatRequest):
+    if not request.user_input:
+        raise HTTPException(status_code=400, detail="Input cannot be empty.")
+
+    if request.user_input.lower() in ['q', 'quit']:
+        return {"response": "Exiting chat."}
+
+    if request.user_input.lower() == 'c':
+        request.history.clear()
+        return {"response": "Clearing chat history."}
+
+    request.history.append({"role": "user", "content": request.user_input})
+    text = tokenizer.apply_chat_template(request.history, tokenize=False, add_generation_prompt=True)
+    response = generate_response(model, text)
+    request.history.append({"role": "assistant", "content": response})
+
+    return {"response": response, "history": request.history}
\ No newline at end of file
--- a/model.properties
+++ b/model.properties
+# 模型编码
+modelCode=1232
+# 模型名称
+modelName=marco-o1_pytorch
+# 模型描述
+modelDescription=通过结合CoT微调、MCTS算法和推理行动策略，Marco-o1超越部分闭源产品。
+# 应用场景
+appScenario=推理,对话问答,制造,广媒,金融,能源,医疗,家居,教育
+# 框架类型
+frameType=pytorch
--- a/requirements.txt
+++ b/requirements.txt
+torch
+transformers
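+# vllm==0.6.2  (left commented out; presumably installed from the bundled wheel under whl/ rather than from PyPI -- confirm)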
+tqdm
--- a/src/talk_with_model.py
+++ b/src/talk_with_model.py
+"""
+Copyright (C) 2024 AIDC-AI
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+import torch
+from typing import List, Dict, Tuple
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def load_model_and_tokenizer(path):
+    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True).to('cuda:0')
+    model.eval()
+    return tokenizer, model
+
+
+def generate_response(model, tokenizer,
+                      input_ids, attention_mask,
+                      max_new_tokens=4096):
+    generated_ids = input_ids
+    with torch.inference_mode():
+        for _ in range(max_new_tokens):
+            outputs = model(input_ids=generated_ids, attention_mask=attention_mask)
+            next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1).unsqueeze(-1)
+            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
+            attention_mask = torch.cat([attention_mask, torch.ones_like(next_token_id)], dim=-1)
+            new_token = tokenizer.decode(next_token_id.squeeze(), skip_special_tokens=True)
+            print(new_token, end='', flush=True)
+            if next_token_id.item() == tokenizer.eos_token_id:
+                break
+    return tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
+
+
+def chat(model, tokenizer):
+    history: List[Dict[str, str]] = []
+    print("Enter 'q' to quit, 'c' to clear chat history.")
+    while True:
+        user_input = input("User: ").strip().lower()
+        if user_input == 'q':
+            print("Exiting chat.")
+            break
+        if user_input == 'c':
+            print("Clearing chat history.")
+            history.clear()
+            continue
+        if not user_input:
+            print("Input cannot be empty.")
+            continue
+
+        history.append({"role": "user", "content": user_input})
+        text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
+        model_inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096).to('cuda:0')
+
+        print('Assistant:', end=' ', flush=True)
+        response = generate_response(model, tokenizer, model_inputs.input_ids, model_inputs.attention_mask)
+        print()
+        history.append({"role": "assistant", "content": response})
+
+
+def main():
+    path = "AIDC-AI/Marco-o1"
+    tokenizer, model = load_model_and_tokenizer(path)
+    print('Starting chat.')
+    chat(model, tokenizer)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/talk_with_model_vllm.py
+++ b/src/talk_with_model_vllm.py
+"""
+Copyright (C) 2024 AIDC-AI
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+import torch
+from vllm import LLM, SamplingParams
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def load_model_and_tokenizer(path):
+    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
+    model = LLM(model=path, tensor_parallel_size=4)
+    return tokenizer, model
+
+
+def generate_response(model,
+                      text,
+                      max_new_tokens=4096):
+    new_output = ''
+    sampling_params = SamplingParams(
+        max_tokens=1,
+        temperature=0,
+        top_p=0.9
+    )
+    with torch.inference_mode():
+        for _ in range(max_new_tokens):
+            outputs = model.generate(
+                [f'{text}{new_output}'],
+                sampling_params=sampling_params,
+                use_tqdm=False
+            )
+            new_output += outputs[0].outputs[0].text
+            print(outputs[0].outputs[0].text, end='', flush=True)
+            if new_output.endswith('</Output>'):
+                break
+    return new_output
+
+
+def chat(model, tokenizer):
+    history = []
+    print("Enter 'q' to quit, 'c' to clear chat history.")
+    while True:
+        user_input = input("User: ").strip().lower()
+        if user_input == 'q':
+            print("Exiting chat.")
+            break
+        if user_input == 'c':
+            print("Clearing chat history.")
+            history.clear()
+            continue
+        if not user_input:
+            print("Input cannot be empty.")
+            continue
+
+        history.append({"role": "user", "content": user_input})
+        text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
+        print('Assistant:', end=' ', flush=True)
+        response = generate_response(model, text)
+        print()
+        history.append({"role": "assistant", "content": response})
+
+
+def main():
+    path = "AIDC-AI/Marco-o1"
+    #path = 'Your local path here'
+
+    tokenizer, model = load_model_and_tokenizer(path)
+    print('Starting chat.')
+    chat(model, tokenizer)
+
+
+if __name__ == "__main__":
+    main()
--- a/whl/lmslim-0.1.2+das.dtk24043-cp310-cp310-linux_x86_64.whl
+++ b/whl/lmslim-0.1.2+das.dtk24043-cp310-cp310-linux_x86_64.whl
--- a/whl/vllm-0.6.2+das.opt1.cd549d3.dtk24043-cp310-cp310-linux_x86_64.whl
+++ b/whl/vllm-0.6.2+das.opt1.cd549d3.dtk24043-cp310-cp310-linux_x86_64.whl