Commit 8bd0f91c authored by PanZezhong1725's avatar PanZezhong1725
Browse files

feat: specify distribution policy when register

parent 11c6e423
...@@ -6,7 +6,6 @@ from libinfinicore_infer import ( ...@@ -6,7 +6,6 @@ from libinfinicore_infer import (
DataType, DataType,
DeviceType, DeviceType,
load_model_weight, load_model_weight,
load_model_weight_distributed,
create_jiuge_awq_weights, create_jiuge_awq_weights,
create_jiuge_awq_model, create_jiuge_awq_model,
destroy_jiuge_awq_model, destroy_jiuge_awq_model,
...@@ -196,38 +195,15 @@ class JiugeAWQForCausalLM: ...@@ -196,38 +195,15 @@ class JiugeAWQForCausalLM:
for key in f.keys(): for key in f.keys():
# print(key) # print(key)
tensor = f.get_tensor(key) tensor = f.get_tensor(key)
if "proj" in key and "bias" not in key: if "o_proj.scales" in key:
if "o_proj" in key or "down_proj" in key: tensor = tensor * self.meta.scale_o
tensor = ( elif "down_proj.scales" in key:
tensor.reshape(tensor.shape[0], self.ndev, -1) tensor = tensor * self.meta.scale_down
.permute(1, 0, 2) elif "embed_tokens.weight" in key:
.contiguous() tensor = tensor * self.meta.scale_input
) elif "lm_head.weight" in key:
if "o_proj.scales" in key: tensor = tensor * self.meta.scale_output
tensor = tensor * self.meta.scale_o load_model_weight(self.weights, key, tensor.data_ptr())
elif "down_proj.scales" in key:
tensor = tensor * self.meta.scale_down
load_model_weight_distributed(
self.weights,
key,
tensor.data_ptr(),
self.dev_ids,
self.ndev,
)
else:
load_model_weight_distributed(
self.weights,
key,
tensor.data_ptr(),
self.dev_ids,
self.ndev,
)
else:
if "embed_tokens.weight" in key:
tensor = tensor * self.meta.scale_input
elif "lm_head.weight" in key:
tensor = tensor * self.meta.scale_output
load_model_weight(self.weights, key, tensor.data_ptr())
def max_context_len(self): def max_context_len(self):
return self.meta.dctx return self.meta.dctx
......
...@@ -412,16 +412,6 @@ def __open_library__(): ...@@ -412,16 +412,6 @@ def __open_library__():
] ]
lib.loadModelWeight.restype = None lib.loadModelWeight.restype = None
# loadModelWeightDistributed
lib.loadModelWeightDistributed.argtypes = [
POINTER(ModelWeightsCStruct), # struct ModelWeights*
c_char_p, # const char* name
c_void_p, # void* data
POINTER(c_int), # int* ranks
c_int, # int nrank
]
lib.loadModelWeightDistributed.restype = None
return lib return lib
...@@ -432,10 +422,6 @@ def load_model_weight(weights, name, data): ...@@ -432,10 +422,6 @@ def load_model_weight(weights, name, data):
LIB.loadModelWeight(weights, name.encode("utf-8"), data) LIB.loadModelWeight(weights, name.encode("utf-8"), data)
def load_model_weight_distributed(weights, name, data, ranks, nrank):
LIB.loadModelWeightDistributed(weights, name.encode("utf-8"), data, ranks, nrank)
create_jiuge_model = LIB.createJiugeModel create_jiuge_model = LIB.createJiugeModel
destroy_jiuge_model = LIB.destroyJiugeModel destroy_jiuge_model = LIB.destroyJiugeModel
create_kv_cache = LIB.createKVCache create_kv_cache = LIB.createKVCache
......
...@@ -4,63 +4,65 @@ ...@@ -4,63 +4,65 @@
#include "../utils.hpp" #include "../utils.hpp"
#include <infinirt.h> #include <infinirt.h>
#include <numeric>
namespace infinicore { namespace infinicore::weights {
WeightsLoader::WeightsLoader(infiniDevice_t dev, const std::vector<int> &dev_ids) : _device(dev), _dev_ids(dev_ids) { void Weight::load(const void *host_data, infinirtStream_t stream) {
if (_dist_type == DistributionType::FULL) {
_tensor->load(host_data, stream);
} else if (_dist_type == DistributionType::ROW || _tensor->ndim() == 1) { // 1D column-distributed is same as row-distributed
_tensor->load((const char *)host_data + _rank * _tensor->numel() * dsize(_tensor->dtype()), stream);
} else if (_dist_type == DistributionType::COLUMN && _tensor->ndim() > 1) { // _dist_type == DistributionType::COLUMN
void *rearranged_ptr;
RUN_INFINI(infinirtMallocHost(&rearranged_ptr, _tensor->numel() * dsize(_tensor->dtype())));
size_t row_size = _tensor->shape()[_tensor->ndim() - 1] * dsize(_tensor->dtype());
size_t host_offset = _rank * row_size;
size_t host_row_size = _nrank * row_size;
size_t rows = std::accumulate(_tensor->shape().begin(), _tensor->shape().end() - 1, size_t(1), std::multiplies<size_t>());
for (size_t row = 0; row < rows; row++) {
memcpy((char *)rearranged_ptr + row * row_size,
(char *)host_data + host_offset + row * host_row_size,
row_size);
}
_tensor->load(rearranged_ptr, stream);
RUN_INFINI(infinirtFreeHost(rearranged_ptr));
} else {
std::cerr << "Unsupported distribution type: " << _dist_type << std::endl;
std::abort();
}
};
Loader::Loader(infiniDevice_t dev, const std::vector<int> &dev_ids) : _device(dev), _dev_ids(dev_ids) {
_streams.resize(_dev_ids.size()); _streams.resize(_dev_ids.size());
_weights.resize(_dev_ids.size()); _weights_maps.resize(_dev_ids.size());
for (int rank = 0; rank < int(_dev_ids.size()); rank++) { for (int rank = 0; rank < int(_dev_ids.size()); rank++) {
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank])); RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
_weights[rank] = std::unordered_map<std::string, std::shared_ptr<Tensor>>(); _weights_maps[rank] = std::unordered_map<std::string, std::shared_ptr<Weight>>();
RUN_INFINI(infinirtStreamCreate(&_streams[rank])); RUN_INFINI(infinirtStreamCreate(&_streams[rank]));
} }
} }
void WeightsLoader::resigter(const std::string &name, std::shared_ptr<Tensor> tensor, int rank) { void Loader::resigter(const std::string &name, std::shared_ptr<Tensor> tensor, int rank, DistributionType dist_type) {
_weights[rank][name] = tensor; _weights_maps[rank][name] = std::make_shared<Weight>(tensor, rank, _dev_ids.size(), dist_type);
} }
void WeightsLoader::load_weight(const std::string &name, const void *host_data) { void Loader::load(const std::string &name, const void *host_data) {
for (int rank = 0; rank < int(_dev_ids.size()); rank++) { for (int rank = 0; rank < int(_dev_ids.size()); rank++) {
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank])); RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
auto it = _weights[rank].find(name); auto it = _weights_maps[rank].find(name);
if (it == _weights[rank].end()) { if (it == _weights_maps[rank].end()) {
std::cerr << "Weight " << name << " not found in rank " << rank << std::endl; std::cerr << "Weight " << name << " not found in rank " << rank << std::endl;
std::abort(); std::abort();
} }
_weights[rank][name]->load(host_data, _streams[rank]); _weights_maps[rank][name]->load(host_data, _streams[rank]);
}
for (int rank = int(_dev_ids.size() - 1); rank >= 0; rank--) {
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
RUN_INFINI(infinirtStreamSynchronize(_streams[rank]));
}
}
void WeightsLoader::load_distributed_weight(const std::string &name, const void *host_data, const std::vector<int> &ranks) {
for (size_t i = 0; i < ranks.size(); i++) {
int rank = ranks[i];
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
auto it = _weights[rank].find(name);
if (it == _weights[rank].end()) {
std::cerr << "Weight " << name << " not found in rank " << rank << std::endl;
std::abort();
}
_weights[rank][name]->load((char *)host_data + i * _weights[rank][name]->numel() * dsize(_weights[rank][name]->dtype()), _streams[rank]);
} }
for (int rank = int(_dev_ids.size() - 1); rank >= 0; rank--) { for (int rank = int(_dev_ids.size() - 1); rank >= 0; rank--) {
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank])); RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
RUN_INFINI(infinirtStreamSynchronize(_streams[rank])); RUN_INFINI(infinirtStreamSynchronize(_streams[rank]));
} }
} }
void WeightsLoader::load_rank_weight(const std::string &name, const void *host_data, int rank) {
auto it = _weights[rank].find(name); void Loader::finalize() {
if (it == _weights[rank].end()) {
std::cerr << "Weight " << name << " not found in rank " << rank << std::endl;
std::abort();
}
RUN_INFINI(infinirtSetDevice(_device, _dev_ids[rank]));
_weights[rank][name]->load(host_data);
}
void WeightsLoader::finalize() {
int dev_id; int dev_id;
RUN_INFINI(infinirtGetDevice(nullptr, &dev_id)); RUN_INFINI(infinirtGetDevice(nullptr, &dev_id));
for (int rank = 0; rank < int(_dev_ids.size()); rank++) { for (int rank = 0; rank < int(_dev_ids.size()); rank++) {
...@@ -70,25 +72,16 @@ void WeightsLoader::finalize() { ...@@ -70,25 +72,16 @@ void WeightsLoader::finalize() {
} }
RUN_INFINI(infinirtSetDevice(_device, dev_id)); RUN_INFINI(infinirtSetDevice(_device, dev_id));
} }
std::shared_ptr<Tensor> WeightsLoader::get(const std::string &name, int rank) { std::shared_ptr<Tensor> Loader::get(const std::string &name, int rank) {
return _weights[rank][name]; return _weights_maps[rank][name]->tensor();
} }
} // namespace infinicore } // namespace infinicore::weights
__C void __C void
loadModelWeight(struct ModelWeights *weights_, const char *name, void *data) { loadModelWeight(struct ModelWeights *weights_, const char *name, void *data) {
std::string name_str(name); std::string name_str(name);
// std::cout << "Loading weight: " << name_str << std::endl; // std::cout << "Loading weight: " << name_str << std::endl;
auto weights = reinterpret_cast<infinicore::WeightsLoader *>(weights_); auto weights = reinterpret_cast<infinicore::weights::Loader *>(weights_);
weights->load_weight(name_str, data); weights->load(name_str, data);
}
__C void
loadModelWeightDistributed(struct ModelWeights *weights_, const char *name, void *data, int *ranks, int nrank) {
std::string name_str(name);
// std::cout << "Loading dist weight: " << name_str << std::endl;
auto weights = reinterpret_cast<infinicore::WeightsLoader *>(weights_);
std::vector<int> rank_vec(ranks, ranks + nrank);
weights->load_distributed_weight(name_str, data, rank_vec);
} }
...@@ -7,24 +7,55 @@ ...@@ -7,24 +7,55 @@
#include <vector> #include <vector>
namespace infinicore { namespace infinicore {
class WeightsLoader {
namespace weights {
enum DistributionType {
FULL,
ROW,
COLUMN
};
class Weight {
private:
std::shared_ptr<Tensor> _tensor;
int _rank;
int _nrank;
DistributionType _dist_type;
public:
Weight(std::shared_ptr<Tensor> tensor,
int rank = 0,
int nrank = 1,
DistributionType dist_type = DistributionType::FULL)
: _tensor(tensor), _rank(rank), _nrank(nrank), _dist_type(dist_type) {}
std::shared_ptr<Tensor> tensor() const { return _tensor; }
int rank() const { return _rank; }
int nrank() const { return _nrank; }
void load(const void *host_data, infinirtStream_t stream = nullptr);
};
class Loader {
protected: protected:
std::vector<std::unordered_map<std::string, std::shared_ptr<Tensor>>> _weights; std::vector<std::unordered_map<std::string, std::shared_ptr<Weight>>> _weights_maps;
infiniDevice_t _device; infiniDevice_t _device;
std::vector<int> _dev_ids; std::vector<int> _dev_ids;
std::vector<infinirtStream_t> _streams; std::vector<infinirtStream_t> _streams;
public: public:
WeightsLoader(infiniDevice_t, const std::vector<int> &dev_ids); Loader(infiniDevice_t, const std::vector<int> &dev_ids);
void resigter(const std::string &name, std::shared_ptr<Tensor> tensor, int rank = 0);
void load_weight(const std::string &name, const void *host_data); /// @brief register a tensor to the loader
void load_distributed_weight(const std::string &name, const void *host_data, const std::vector<int> &ranks); /// @param name name (aka key) of the tensor
void load_rank_weight(const std::string &name, const void *host_data, int rank); /// @param tensor
/// @param rank the rank of the weight tensor (default 0)
/// @param dist_type either FULL, or distributed by ROW or COLUMN (default FULL)
void resigter(const std::string &name, std::shared_ptr<Tensor> tensor, int rank = 0, DistributionType dist_type = DistributionType::FULL);
void load(const std::string &name, const void *host_data);
void finalize(); void finalize();
std::shared_ptr<Tensor> get(const std::string &name, int rank = 0); std::shared_ptr<Tensor> get(const std::string &name, int rank = 0);
const std::vector<int> &dev_ids() const { return _dev_ids; } const std::vector<int> &devIds() const { return _dev_ids; }
infiniDevice_t device() const { return _device; } infiniDevice_t device() const { return _device; }
}; };
} // namespace weights
} // namespace infinicore } // namespace infinicore
#endif // WEIGHTS_LOADER_HPP #endif // WEIGHTS_LOADER_HPP
...@@ -351,7 +351,7 @@ void launchDevice(const JiugeAWQMeta *meta, std::shared_ptr<JiugeAWQDeviceWeight ...@@ -351,7 +351,7 @@ void launchDevice(const JiugeAWQMeta *meta, std::shared_ptr<JiugeAWQDeviceWeight
JiugeAWQModel::JiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights_) { JiugeAWQModel::JiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights_) {
auto weights = (JiugeAWQWeights *)(weights_); auto weights = (JiugeAWQWeights *)(weights_);
device = weights->device(); device = weights->device();
dev_ids = weights->dev_ids(); dev_ids = weights->devIds();
int ndev = int(dev_ids.size()); int ndev = int(dev_ids.size());
dev_resources = std::vector<DeviceResource>(ndev); dev_resources = std::vector<DeviceResource>(ndev);
states = std::vector<InferState>(ndev); states = std::vector<InferState>(ndev);
......
...@@ -19,7 +19,7 @@ struct JiugeAWQDeviceWeight { ...@@ -19,7 +19,7 @@ struct JiugeAWQDeviceWeight {
std::vector<std::shared_ptr<QuantInt4Weight>> w_attn_q, w_attn_k, w_attn_v, w_attn_out, w_ffn_gate, w_ffn_up, w_ffn_down; std::vector<std::shared_ptr<QuantInt4Weight>> w_attn_q, w_attn_k, w_attn_v, w_attn_out, w_ffn_gate, w_ffn_up, w_ffn_down;
}; };
class JiugeAWQWeights : public infinicore::WeightsLoader { class JiugeAWQWeights : public infinicore::weights::Loader {
private: private:
std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> _device_weights; std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> _device_weights;
......
...@@ -43,7 +43,7 @@ inline std::shared_ptr<Tensor> getCosTable(size_t dctx, size_t dh, float theta) ...@@ -43,7 +43,7 @@ inline std::shared_ptr<Tensor> getCosTable(size_t dctx, size_t dh, float theta)
JiugeAWQWeights::JiugeAWQWeights( JiugeAWQWeights::JiugeAWQWeights(
const JiugeAWQMeta *meta, const JiugeAWQMeta *meta,
infiniDevice_t device, infiniDevice_t device,
const std::vector<int> &dev_ids) : infinicore::WeightsLoader(device, dev_ids) { const std::vector<int> &dev_ids) : infinicore::weights::Loader(device, dev_ids) {
auto ndev = dev_ids.size(); auto ndev = dev_ids.size();
_device_weights.resize(ndev); _device_weights.resize(ndev);
infiniDtype_t dt_logits = meta->dt_logits; infiniDtype_t dt_logits = meta->dt_logits;
...@@ -82,35 +82,35 @@ JiugeAWQWeights::JiugeAWQWeights( ...@@ -82,35 +82,35 @@ JiugeAWQWeights::JiugeAWQWeights(
for (size_t layer = 0; layer < nlayer; layer++) { for (size_t layer = 0; layer < nlayer; layer++) {
#define RIGISTER_LAYER_WEIGHT(W_NAME, W_VAR, W_SHAPE, W_DTYPE) \ #define RIGISTER_LAYER_WEIGHT(W_NAME, W_VAR, W_SHAPE, W_DTYPE, W_DIST_TYPE) \
auto W_VAR = Tensor::weight(nullptr, W_DTYPE, W_SHAPE); \ auto W_VAR = Tensor::weight(nullptr, W_DTYPE, W_SHAPE); \
this->resigter(W_NAME, W_VAR, i); \ this->resigter(W_NAME, W_VAR, i, infinicore::weights::DistributionType::W_DIST_TYPE); \
weight->W_VAR.push_back(W_VAR); weight->W_VAR.push_back(W_VAR);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, {d}, dt_norm_w); RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, {d}, dt_norm_w, FULL);
#define REGISTER_LAYER_QUANT_WEIGHT(W_NAME, W_VAR, W_IN, W_OUT) \ #define REGISTER_LAYER_QUANT_WEIGHT(W_NAME, W_VAR, W_IN, W_OUT, W_DIST_TYPE) \
auto W_VAR = std::make_shared<QuantInt4Weight>(); \ auto W_VAR = std::make_shared<QuantInt4Weight>(); \
W_VAR->w = Tensor::weight(nullptr, INFINI_DTYPE_I32, {W_IN, (W_OUT)*nbit / 32}); \ W_VAR->w = Tensor::weight(nullptr, INFINI_DTYPE_I32, {W_IN, (W_OUT)*nbit / 32}); \
this->resigter(W_NAME + ".qweight", W_VAR->w, i); \ this->resigter(W_NAME + ".qweight", W_VAR->w, i, infinicore::weights::DistributionType::W_DIST_TYPE); \
W_VAR->s = Tensor::weight(nullptr, INFINI_DTYPE_F16, {(W_IN) / quant_group_size, (W_OUT)}); \ W_VAR->s = Tensor::weight(nullptr, INFINI_DTYPE_F16, {(W_IN) / quant_group_size, (W_OUT)}); \
this->resigter(W_NAME + ".scales", W_VAR->s, i); \ this->resigter(W_NAME + ".scales", W_VAR->s, i, infinicore::weights::DistributionType::W_DIST_TYPE); \
W_VAR->z = Tensor::weight(nullptr, INFINI_DTYPE_I32, {(W_IN) / quant_group_size, (W_OUT)*nbit / 32}); \ W_VAR->z = Tensor::weight(nullptr, INFINI_DTYPE_I32, {(W_IN) / quant_group_size, (W_OUT)*nbit / 32}); \
this->resigter(W_NAME + ".qzeros", W_VAR->z, i); \ this->resigter(W_NAME + ".qzeros", W_VAR->z, i, infinicore::weights::DistributionType::W_DIST_TYPE); \
weight->W_VAR.push_back(W_VAR); weight->W_VAR.push_back(W_VAR);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj", w_attn_q, d, nh * dh); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj", w_attn_q, d, nh * dh, COLUMN);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj", w_attn_k, d, nkvh * dh); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj", w_attn_k, d, nkvh * dh, COLUMN);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj", w_attn_v, d, nkvh * dh); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj", w_attn_v, d, nkvh * dh, COLUMN);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj.bias", b_attn_q, {nh * dh}, INFINI_DTYPE_F16); RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj.bias", b_attn_q, {nh * dh}, INFINI_DTYPE_F16, COLUMN);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj.bias", b_attn_k, {nkvh * dh}, INFINI_DTYPE_F16); RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj.bias", b_attn_k, {nkvh * dh}, INFINI_DTYPE_F16, COLUMN);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj.bias", b_attn_v, {nkvh * dh}, INFINI_DTYPE_F16); RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj.bias", b_attn_v, {nkvh * dh}, INFINI_DTYPE_F16, COLUMN);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.o_proj", w_attn_out, nh * dh, d); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.o_proj", w_attn_out, nh * dh, d, ROW);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, {d}, dt_norm_w); RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, {d}, dt_norm_w, FULL);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.gate_proj", w_ffn_gate, d, di); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.gate_proj", w_ffn_gate, d, di, COLUMN);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.up_proj", w_ffn_up, d, di); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.up_proj", w_ffn_up, d, di, COLUMN);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.down_proj", w_ffn_down, di, d); REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.down_proj", w_ffn_down, di, d, ROW);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment