Commit a52e53db authored by chenzk (v1.0)
# Example 1: chat with Qwen3-8B via Hugging Face Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-8B"

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

# Prepare the model input
prompt = "Give me a short introduction to large language models."
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# The result begins with thinking content in <think></think> tags, followed by the actual response
print(tokenizer.decode(output_ids, skip_special_tokens=True))
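Since generation begins with the reasoning trace, it is often convenient to separate it from the final answer. A minimal sketch continuing from the snippet above, assuming the trace is closed by a literal </think> marker in the decoded text:

# Split the decoded output into the thinking trace and the final answer.
# Assumption: the reasoning block ends with a literal "</think>" marker.
full_text = tokenizer.decode(output_ids, skip_special_tokens=True)
thinking_content, marker, final_answer = full_text.partition("</think>")
if not marker:  # no thinking block found (e.g. when enable_thinking=False)
    thinking_content, final_answer = "", full_text
print("Thinking:", thinking_content.strip())
print("Answer:", final_answer.strip())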
# Example 2: offline batch inference with vLLM
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from multiprocessing import freeze_support

if __name__ == '__main__':
    freeze_support()

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

    # Default decoding hyperparameters for Qwen3-8B;
    # max_tokens caps the generation length.
    sampling_params = SamplingParams(temperature=0.7, top_p=0.8, repetition_penalty=1.05, max_tokens=512)

    # Pass the model name or path; GPTQ and AWQ quantized models also work here.
    llm = LLM(model="Qwen/Qwen3-8B", tensor_parallel_size=4)

    # Prepare your prompts
    prompt = "How many r's are in the word \"strawberry\"?"
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Alternative: prepend a system prompt, e.g.
    # messages = [
    #     {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    #     {"role": "user", "content": prompt}
    # ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Generate outputs
    outputs = llm.generate([text], sampling_params)

    # Print the outputs
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# Model code
modelCode=1517
# Model name
modelName=Qwen3_pytorch
# Model description
modelDescription=With only about one third the parameter count of DeepSeek-R1, it cuts cost substantially while outperforming R1, OpenAI-o1, and other leading models, integrating fast thinking and slow thinking in a single model.
# Application scenarios
appScenario=Reasoning, dialogue Q&A, manufacturing, broadcast media, finance, energy, healthcare, smart home, education
# Framework type
frameType=pytorch
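These key=value pairs follow a simple properties format. A minimal sketch of reading them in Python, assuming the block is saved to a file; the name model.properties is hypothetical:

def load_properties(path):
    """Parse a simple key=value properties file, skipping comments and blanks."""
    props = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, _, value = line.partition("=")
            props[key.strip()] = value.strip()
    return props

meta = load_properties("model.properties")  # hypothetical file name
print(meta["modelName"], meta["frameType"])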