Unverified commit de3e6b95 authored by Haojie Wang, committed by GitHub

Merge pull request #187 from InfiniTensor/issue/186

issue/186 support longrope
parents c1a3ab29 fc454c77
......@@ -7,6 +7,8 @@
#include "../infinilm_model.hpp"
#include <infinicore/nn/rope.hpp>
namespace infinilm::models::llama {
/**
......@@ -20,41 +22,43 @@ struct LlamaConfig : public InfinilmModel::Config {
infinicore::DataType dtype = infinicore::DataType::F32;
// Vocabulary and embedding
size_t vocab_size = 32000; // Vocabulary size
size_t hidden_size = 4096; // Hidden dimension size
size_t intermediate_size = 11008; // MLP intermediate dimension
size_t vocab_size = 32000; // Vocabulary size
size_t hidden_size = 4096; // Hidden dimension size
size_t intermediate_size = 11008; // MLP intermediate dimension
// Architecture
size_t num_hidden_layers = 32; // Number of decoder layers
size_t num_attention_heads = 32; // Number of attention heads
size_t num_key_value_heads = 32; // Number of key-value heads (for GQA)
size_t head_dim = 128; // Attention head dimension (hidden_size / num_attention_heads)
size_t num_hidden_layers = 32; // Number of decoder layers
size_t num_attention_heads = 32; // Number of attention heads
size_t num_key_value_heads = 32; // Number of key-value heads (for GQA)
size_t head_dim = 128; // Attention head dimension (hidden_size / num_attention_heads)
// Position embeddings
size_t max_position_embeddings = 2048; // Maximum sequence length
double rope_theta = 10000.0; // RoPE base frequency
size_t max_position_embeddings = 2048; // Maximum sequence length
double rope_theta = 10000.0; // RoPE base frequency
std::shared_ptr<infinicore::nn::RoPE::ScalingConfig> rope_scaling = nullptr; // RoPE scaling configuration (nullptr = no scaling)
// Normalization
double rms_norm_eps = 1e-6; // RMSNorm epsilon
double rms_norm_eps = 1e-6; // RMSNorm epsilon
// Activation
std::string hidden_act = "silu"; // Activation function (typically "silu")
std::string model_type = "llama"; // Model type identifier (matches HF configs)
std::string hidden_act = "silu"; // Activation function (typically "silu")
std::string model_type = "llama"; // Model type identifier (matches HF configs)
// Optional features
bool use_cache = true; // Whether to use KV cache
bool attention_bias = true; // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
bool mlp_bias = false; // Whether to use bias in MLP projections
bool tie_word_embeddings = false; // Whether to tie input/output embeddings
bool use_cache = true; // Whether to use KV cache
bool attention_bias = true; // Whether to use bias in Q/K/V projections (default true for 9G7B compatibility)
bool attention_output_bias = false; // Whether to use bias in output projection (o_proj)
bool mlp_bias = false; // Whether to use bias in MLP projections
bool tie_word_embeddings = false; // Whether to tie input/output embeddings
// Training/initialization parameters
double attention_dropout = 0.0; // Dropout ratio for attention probabilities
double initializer_range = 0.02; // Standard deviation for weight initialization
size_t pretraining_tp = 1; // Tensor parallelism rank used during pretraining
double attention_dropout = 0.0; // Dropout ratio for attention probabilities
double initializer_range = 0.02; // Standard deviation for weight initialization
size_t pretraining_tp = 1; // Tensor parallelism rank used during pretraining
// Model metadata
std::string name_or_path = ""; // Model name or path identifier
std::string name_or_path = ""; // Model name or path identifier
// Token IDs
int64_t pad_token_id = -1; // Padding token ID (optional)
......
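The new rope_scaling field carries an optional RoPE scaling configuration; while it stays nullptr, positions are encoded with plain RoPE. As a rough sketch, the HF-style config.json entry this field is meant to capture for longrope looks like the following (keys mirror those consumed by the Python setter added later in this PR; the values are invented and assume head_dim = 128, i.e. 64 frequencies):

# Hypothetical "rope_scaling" entry from an HF-style config.json (values invented for illustration).
rope_scaling = {
    "rope_type": "longrope",
    "short_factor": [1.0] * 64,  # per-frequency scale used at short context lengths
    "long_factor": [4.0] * 64,   # per-frequency scale used beyond the original context
    "original_max_position_embeddings": 2048,
    "factor": 16.0,              # optional; the setter defaults it to 1.0
}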
......@@ -34,7 +34,7 @@ LlamaModel::LlamaModel(const LlamaConfig &config,
// Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing
INFINICORE_NN_MODULE_INIT(rotary_emb, config.head_dim, config.max_position_embeddings,
config.rope_theta, infinicore::nn::RoPE::Algo::GPT_NEOX,
dtype, device);
dtype, device, config.rope_scaling);
for (auto &layer : layers_) {
if (layer) {
......
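For reference, longrope-style scaling rescales each inverse frequency by a per-dimension factor, choosing the short or long factor list depending on whether the current sequence exceeds the original training context. A minimal Python sketch of that idea, modeled on the common Hugging Face formulation (InfiniCore's RoPE kernel may differ in details such as attention rescaling):

# Illustrative only: the frequency-rescaling idea behind longrope,
# not InfiniCore's actual implementation.
def longrope_inv_freq(head_dim, base, seq_len, short_factor, long_factor,
                      original_max_position_embeddings):
    ext = long_factor if seq_len > original_max_position_embeddings else short_factor
    return [
        1.0 / (ext[i] * base ** (2 * i / head_dim))  # scaled inverse frequency per dim pair
        for i in range(head_dim // 2)
    ]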
......@@ -6,6 +6,7 @@
#include "../../models/llama/llama_attention.hpp"
#include "infinicore/device.hpp"
#include "infinicore/nn/module.hpp"
#include "infinicore/nn/rope.hpp"
#include "infinicore/tensor.hpp"
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
......@@ -69,7 +70,8 @@ inline void bind_llama(py::module &m) {
.def_readwrite("pretraining_tp", &LlamaConfig::pretraining_tp)
.def_readwrite("name_or_path", &LlamaConfig::name_or_path)
.def_readwrite("pad_token_id", &LlamaConfig::pad_token_id)
.def_property("bos_token_id", [](const LlamaConfig &self) {
.def_property(
"bos_token_id", [](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.bos_token_id); }, [](LlamaConfig &self, py::object value) {
// Accept both single int and list
......@@ -80,7 +82,8 @@ inline void bind_llama(py::module &m) {
} else {
throw py::type_error("bos_token_id must be int or list of ints");
} })
.def_property("eos_token_id", [](const LlamaConfig &self) {
.def_property(
"eos_token_id", [](const LlamaConfig &self) {
// Always return as list to match Python config format
return py::cast(self.eos_token_id); }, [](LlamaConfig &self, py::object value) {
// Accept both single int and list
......@@ -91,6 +94,86 @@ inline void bind_llama(py::module &m) {
} else {
throw py::type_error("eos_token_id must be int or list of ints");
} })
.def_property(
"rope_scaling",
// ---------- getter ----------
[](const LlamaConfig &self) -> py::object {
if (!self.rope_scaling) {
return py::none();
}
using ScalingConfig = infinicore::nn::RoPE::ScalingConfig;
using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;
py::dict d;
if (auto *lr = dynamic_cast<const LongRopeConfig *>(self.rope_scaling.get())) {
d["type"] = "longrope";
d["rope_type"] = "longrope";
d["factor"] = lr->factor();
d["original_max_position_embeddings"] = lr->original_max_position_embeddings();
d["short_factor"] = lr->short_factor();
d["long_factor"] = lr->long_factor();
} else {
throw std::runtime_error("Unknown RoPE scaling type");
}
return std::move(d);
},
// ---------- setter ----------
[](LlamaConfig &self, py::object value) {
if (value.is_none()) {
self.rope_scaling.reset();
return;
}
if (!py::isinstance<py::dict>(value)) {
throw py::type_error("rope_scaling must be a dict or None");
}
py::dict d = value.cast<py::dict>();
auto get_str = [&](const char *k) {
if (!d.contains(k)) {
throw py::key_error(k);
}
return py::cast<std::string>(d[k]);
};
std::string type = d.contains("rope_type")
? py::cast<std::string>(d["rope_type"])
: get_str("type");
if (type == "longrope") {
using LongRopeConfig = infinicore::nn::RoPE::LongRopeConfig;
if (!d.contains("short_factor") || !d.contains("long_factor") || !d.contains("original_max_position_embeddings")) {
throw py::value_error(
"longrope requires short_factor, long_factor, "
"original_max_position_embeddings");
}
std::vector<float> short_factor = py::cast<std::vector<float>>(d["short_factor"]);
std::vector<float> long_factor = py::cast<std::vector<float>>(d["long_factor"]);
size_t original_max_position_embeddings = py::cast<size_t>(d["original_max_position_embeddings"]);
float factor = 1.0f;
if (d.contains("factor")) {
factor = py::cast<float>(d["factor"]);
}
self.rope_scaling = std::make_shared<LongRopeConfig>(
std::move(short_factor),
std::move(long_factor),
original_max_position_embeddings,
factor);
} else {
throw py::value_error("Unsupported rope_scaling type: " + type);
}
})
.def("validate", &LlamaConfig::validate)
.def("kv_dim", &LlamaConfig::kv_dim)
// Add __dir__ to make attributes discoverable via dir() in Python
......@@ -108,6 +191,7 @@ inline void bind_llama(py::module &m) {
dir_list.append("hidden_act");
dir_list.append("model_type");
dir_list.append("rope_theta");
dir_list.append("rope_scaling");
dir_list.append("attention_bias");
dir_list.append("attention_output_bias");
dir_list.append("mlp_bias");
......
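With the binding above, rope_scaling becomes a regular attribute on the Python side: the setter accepts either None or a dict (with rope_type/type, short_factor, long_factor, original_max_position_embeddings, and an optional factor), and the getter always hands back a dict. A minimal usage sketch (the import path and default constructor are assumptions, not shown in this diff):

# Hypothetical usage of the new property; module path and constructor are assumed.
from infinilm_llm import LlamaConfig  # hypothetical import path

config = LlamaConfig()
config.rope_scaling = {
    "rope_type": "longrope",
    "short_factor": [1.0] * 64,  # assuming head_dim = 128
    "long_factor": [4.0] * 64,
    "original_max_position_embeddings": 2048,
    "factor": 16.0,
}
print(config.rope_scaling)  # dict with both "type" and "rope_type" set to "longrope"
config.rope_scaling = None  # clears the scaling config; the getter then returns None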
......@@ -368,7 +368,7 @@ def render_ceval(_tokenizer, conversation):
def render_mmlu(_tokenizer, question, choices):
"""Render MMLU question and choices to input content"""
choices_text = "\n".join(
[f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)]
[f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)]
)
instruction = (
"You are a multiple-choice question solver. "
......@@ -924,7 +924,9 @@ def test():
splits_to_load = (
["test"]
if split == "test"
else ["validation"] if split == "val" else ["validation", "test"]
else ["validation"]
if split == "val"
else ["validation", "test"]
)
# Load each subject individually from hardcoded list, excluding "all"
for subject_name in mmlu_subjects:
......@@ -946,7 +948,9 @@ def test():
splits_to_load = (
["test"]
if split == "test"
else ["validation"] if split == "val" else ["validation", "test"]
else ["validation"]
if split == "val"
else ["validation", "test"]
)
records = []
for sp in splits_to_load:
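Both occurrences of splits_to_load above are the same chained conditional, only re-wrapped by the formatter; spelled out imperatively (hypothetical helper name):

# Equivalent, imperative form of the chained conditional expression above.
def resolve_splits(split):
    if split == "test":
        return ["test"]
    if split == "val":
        return ["validation"]
    return ["validation", "test"]  # default: evaluate both splits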
......@@ -980,14 +984,13 @@ def test():
all_results = []
for subj in subject_list:
print(f"\n{'='*60}")
print(f"\n{'=' * 60}")
print(f"Evaluating subject: {subj}")
print(f"{'='*60}\n")
print(f"{'=' * 60}\n")
try:
samples, actual_subj_name = load_subject_samples(subj)
print(f"Loaded {len(samples)} samples for subject: {actual_subj_name}")
# Limit number of samples if specified
if num_samples is not None and num_samples > 0:
original_count = len(samples)
......@@ -996,37 +999,9 @@ def test():
f"Limited to {len(samples)} samples for validation (from {original_count} total)"
)
# Test with first sample if available
if len(samples) > 0:
sample = samples[0]
if benchmark == "ceval":
input_content = f"'question':{sample['question']},'A': {sample['A']}, 'B':{sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
test_conversation = [
{
"role": "system",
"content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
},
{"role": "user", "content": input_content},
]
test_output = model.generate(
test_conversation,
max_steps=max_new_tokens,
topp_=1.0,
topk_=1,
temperature_=1.0,
)
elif benchmark == "mmlu":
question = sample["question"]
choices = sample["choices"]
test_output = model.generate(
question,
choices,
max_steps=max_new_tokens,
topp_=1.0,
topk_=1,
temperature_=1.0,
)
print(f"\nTest output: {test_output}\n")
if len(samples) == 0:
print(f"No samples found for subject: {actual_subj_name}")
continue
# Evaluate samples for this subject
result = evaluate_samples(
......@@ -1044,13 +1019,22 @@ def test():
model.destroy_model_instance()
# Calculate overall results
print(f"\n{'=' * 60}")
print("OVERALL RESULTS")
print(f"{'=' * 60}")
if len(all_results) == 0:
print("No tests were run.")
return
elif len(all_results) > 1:
for r in all_results:
print(
f"Subject '{r['subject']}': {r['correct']}/{r['total']} = {r['accuracy']:.2%}"
)
overall_correct = sum(r["correct"] for r in all_results)
overall_total = sum(r["total"] for r in all_results)
overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0.0
print(f"\n{'='*60}")
print("OVERALL RESULTS")
print(f"{'='*60}")
print(f"{'=' * 60}")
if benchmark == "ceval":
print(
f"Overall 成绩: {overall_correct}/{overall_total} = {overall_accuracy:.2%}"
......@@ -1062,7 +1046,7 @@ def test():
print(f"Total Latency: {TOTAL_TIME} seconds")
print(f"Total Tokens Processed: {TOTAL_TOKENS} tokens")
print(f"Overall Throughput: {TOTAL_TOKENS/TOTAL_TIME:.2f} tokens/s")
print(f"Overall Throughput: {TOTAL_TOKENS / TOTAL_TIME:.2f} tokens/s")
# Write CSV if output path is specified
if output_csv:
......
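Two small points about the summary math above: the overall accuracy is a micro-average (total correct over total attempted across subjects, not the mean of per-subject accuracies), and throughput is total generated tokens divided by total wall-clock time. A toy illustration with invented numbers:

# Invented numbers, only to illustrate the aggregation above.
all_results = [
    {"subject": "anatomy", "correct": 30, "total": 50, "accuracy": 0.60},
    {"subject": "astronomy", "correct": 90, "total": 100, "accuracy": 0.90},
]
overall_correct = sum(r["correct"] for r in all_results)  # 120
overall_total = sum(r["total"] for r in all_results)      # 150
overall_accuracy = overall_correct / overall_total        # 0.80, not (0.60 + 0.90) / 2
throughput = 12_000 / 95.0  # tokens / seconds ~= 126.3 tokens/s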