Commit 88c3b719 authored by Rayyyyy

update README

parent d3471cd8
Pipeline #1010 failed with stages in 0 seconds
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Context length and tensor-parallel degree for the Lite chat model.
max_model_len, tp_size = 8192, 1
model_name_or_path = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
llm = LLM(model=model_name_or_path, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])

messages_list = [
    [{"role": "user", "content": "Who are you?"}],
    [{"role": "user", "content": "Translate the following content into Chinese directly: DeepSeek-V2 adopts innovative architectures to guarantee economical training and efficient inference."}],
    [{"role": "user", "content": "Write a piece of quicksort code in C++."}],
]

# apply_chat_template tokenizes by default, so each prompt is already a list of token ids,
# which vLLM consumes directly via prompt_token_ids.
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True) for messages in messages_list]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)

generated_text = [output.outputs[0].text for output in outputs]
print(generated_text)
@@ -3,7 +3,7 @@
[deepseek-v2](https://arxiv.org/abs/2405.04434)
## Model Architecture
DeepSeek-V2 innovates across the entire model framework. It introduces MLA (Multi-head Latent Attention), an architecture that rivals MHA while sharply reducing compute and inference memory; the in-house sparse DeepSeekMoE structure drives compute cost down even further. Combined, the two deliver a cross-tier improvement in model performance.
<div align=center>
<img src="./doc/model.png"/>
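For intuition, the sketch below illustrates the low-rank KV-compression idea behind MLA: the hidden state is down-projected into a small latent vector that is cached during decoding, and keys/values are reconstructed from it by up-projections. The dimensions and module names here are illustrative assumptions, not the actual DeepSeek-V2 implementation (which additionally uses decoupled RoPE keys).
```python
import torch
import torch.nn as nn

class LatentKVCompression(nn.Module):
    """Minimal sketch of MLA-style low-rank KV compression (illustrative only)."""

    def __init__(self, hidden_dim=2048, latent_dim=512, n_heads=16, head_dim=128):
        super().__init__()
        # Compress the hidden state into a small latent vector.
        self.down_proj = nn.Linear(hidden_dim, latent_dim, bias=False)
        # Reconstruct per-head keys and values from the latent vector.
        self.up_proj_k = nn.Linear(latent_dim, n_heads * head_dim, bias=False)
        self.up_proj_v = nn.Linear(latent_dim, n_heads * head_dim, bias=False)

    def forward(self, hidden_states):
        # Only the latent vector needs to be cached at inference time,
        # which is what cuts KV-cache memory compared with standard MHA.
        latent = self.down_proj(hidden_states)
        k = self.up_proj_k(latent)
        v = self.up_proj_v(latent)
        return latent, k, v
```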
@@ -58,10 +58,28 @@ export HF_ENDPOINT=https://hf-mirror.com
Not available yet
## Training
Not available yet
## Inference
Inference is based on Hugging Face Transformers; set the `model_name_or_path` parameter to your local model path.
If the pretrained model has not been downloaded, the code will download it automatically based on your selection. The currently available models are "deepseek-ai/DeepSeek-V2-Lite" and "deepseek-ai/DeepSeek-V2-Lite-Chat".
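A minimal sketch of the loading step described above, assuming the Lite chat checkpoint; the full scripts (`text_completion.py`, `chat_completion.py`) are included in this repo:
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Set this to a local checkpoint path, or keep the Hub id to download automatically.
model_name_or_path = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16
).cuda()
```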
### Text Completion
```bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python text_completion.py
```
### Chat
```bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python chat_completion.py
```
### Accuracy
Not available yet
@@ -71,11 +89,9 @@ export HF_ENDPOINT=https://hf-mirror.com
Conversational Q&A
### Key Application Industries
Manufacturing, Media, Home Furnishing, Education
Finance, Media, Education
## Pretrained Weights
The model directory structure is as follows:
```bash
├── model_save_path
@@ -117,5 +133,4 @@ export HF_ENDPOINT=https://hf-mirror.com
## References
- https://github.com/deepseek-ai/DeepSeek-V2
- https://huggingface.co/deepseek-ai/DeepSeek-V2
- https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite
- https://huggingface.co/deepseek-ai
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
model_name_or_path = "/home/DeepSeek-V2/DeepSeek-V2-Lite-Chat"
# model_name_or_path = "deepseek-ai/DeepSeek-V2-Lite-Chat"
model_name_or_path = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
@@ -13,8 +12,9 @@ model.generation_config.pad_token_id = model.generation_config.eos_token_id
messages = [
{"role": "user", "content": "Write a piece of quicksort code in C++"}
]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print("result", result)
# Unique model identifier
modelCode=639
# Model name
modelName=deepseek-v2_pytorch
# Model description
modelDescription=DeepSeek-V2, a second-generation MoE model with more parameters, stronger capabilities, and lower cost
# Application scenarios
appScenario=Inference, Conversational Q&A, Finance, Media, Education
# Framework type
frameType=pytorch
echo "Export params ..."
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python text_completion.py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
# model_name_or_path = "/home/DeepSeek-V2/DeepSeek-V2-Lite-Chat"  # local checkpoint path
model_name_or_path = "deepseek-ai/DeepSeek-V2-Lite"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# `max_memory` should be set based on your devices
@@ -25,4 +25,4 @@ inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("result", result)