Commit 0371621a authored by chenzk

v1.0
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10
ENV DEBIAN_FRONTEND=noninteractive
# RUN apt-get update && apt-get install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.2/env.sh
# Install Python dependencies via pip
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
docker run -it --shm-size=32G -v $PWD/allamo:/home/allamo -v /parastor/DL_DATA/HOT:/home/HOT -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name llama f6b99c8a0f01 bash
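# Notes on the run command above: --shm-size raises shared memory for PyTorch data loaders,
# the -v flags mount the training code, the dataset directory and the /opt/hyhal runtime into the container,
# and --device=/dev/kfd --device=/dev/dri together with --group-add video expose the DCU/ROCm compute devices inside the container.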
# python -m torch.utils.collect_env
from allamo.configuration import AllamoConfiguration
from allamo.trainer.dpo_fsdp_trainer import DPOTrainer
if __name__ == '__main__':
config = AllamoConfiguration()
config.training_type = 'dpo'
trainer = DPOTrainer(config)
trainer.init_wandb()
trainer.train()
trainer.close()
from allamo.configuration import AllamoConfiguration
from allamo.trainer.fsdp_trainer import FSDPTrainer
if __name__ == '__main__':
config = AllamoConfiguration()
trainer = FSDPTrainer(config)
trainer.init_wandb()
trainer.train()
trainer.close()
icon.png (53.8 KB)
python inference/sample.py --config="./train_configs/train_1B.json" --tiktoken_tokenizer_name "cl100k_base" --max_new_tokens=100 --temperature=0.7 --top_k=200 --num_samples=5 --prompt="Long long time ago"
# python inference/sample.py --config="./train_configs/train_1B.json" --hf_tokenizer_path "scripts/Meta-Llama-3.1-8B" --max_new_tokens=100 --temperature=0.7 --top_k=200 --num_samples=5 --prompt="Long long time ago" # for Meta-Llama-3.1-8B
"""
Use this file to sample from a trained model.
"""
import dataclasses
import json
import os
import time
import torch
from allamo.logging import configure_logger, logger
from allamo.configuration import AllamoConfiguration
from allamo.model.model import AllamoTransformerConfig, AllamoTransformer
from allamo.torch_utils import configure_torch
from allamo.train_utils import remove_unwanted_prefix_from_model_state_dict
class AllamoSampler:
def __init__(self, config: AllamoConfiguration):
configure_logger()
self.config = config
configure_torch(config)
if config.init_from == 'resume_last':
checkpoint_name = 'last_eval_ckpt'
else:
checkpoint_name = 'ckpt'
ckpt_dir = config.checkpoint_path if config.checkpoint_path else config.out_dir
model_config_fields = [f.name for f in dataclasses.fields(AllamoTransformerConfig)]
logger.info(f"Loading '{checkpoint_name}' checkpoint files from {ckpt_dir}...")
with open(os.path.join(ckpt_dir, f'config_{checkpoint_name}.json'), "r", encoding="utf-8") as f:
config_checkpoint = json.load(f)
for k in model_config_fields:
if hasattr(config, k) and k in config_checkpoint['model_args']:
setattr(config, k, config_checkpoint['model_args'][k])
model_checkpoint = torch.load(os.path.join(ckpt_dir, f'model_{checkpoint_name}.pt'), map_location='cpu')
self.__load_model(config, config_checkpoint, model_checkpoint, model_config_fields)
self.__load_tokenizer(config)
del model_checkpoint
def __load_model(self, config: AllamoConfiguration, config_checkpoint, model_checkpoint, model_config_fields):
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'bfloat16-true': torch.bfloat16, 'float16': torch.float16}[config.dtype]
model_args = {k: getattr(config, k) for k in model_config_fields if hasattr(config, k)}
modelConf = AllamoTransformerConfig(**model_args)
model = AllamoTransformer(modelConf)
remove_unwanted_prefix_from_model_state_dict(model_checkpoint)
model.load_state_dict(model_checkpoint)
model.eval()
model.to(device=config.device, dtype=ptdtype)
if config.compile:
model = torch.compile(model)
self.model = model
logger.info(f"Model loaded from checkpoint")
if 'iter_num' in config_checkpoint:
logger.info(f"Last model iteration: {config_checkpoint['iter_num']}")
def __load_tokenizer(self, config: AllamoConfiguration):
tiktoken_tokenizer_name = config.tiktoken_tokenizer_name
hf_tokenizer_path = config.hf_tokenizer_path
if hf_tokenizer_path is not None:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_path)
logger.info(f"HuggingFace tokenizer loaded: {hf_tokenizer_path}")
elif tiktoken_tokenizer_name is not None:
import tiktoken
tokenizer = tiktoken.get_encoding(tiktoken_tokenizer_name)
logger.info(f"Tiktoken tokenizer loaded: {tiktoken_tokenizer_name}")
else:
raise Exception('Tokenizer is not provided. Please specify either a Tiktoken tokenizer or a HuggingFace tokenizer')
# ensure that the tokenizer and model vocabulary sizes are equal
# assert len(tokenizer) == self.model.config.vocab_size
self.tokenizer = tokenizer
def tokenize_prompt(self, text: str):
return self.tokenizer.encode(text)
def encode_prompt(self, text: str):
prompt_tokens = self.tokenize_prompt(text)
prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=self.config.device)
prompt_tokens = torch.unsqueeze(prompt_tokens, 0)
return prompt_tokens
def generate_embeddings(self, text: str):
if text:
with torch.no_grad():
prompt_tokens = self.encode_prompt(text)
embeddings = self.model.generate_embeddings(prompt_tokens)
embeddings = torch.squeeze(embeddings[:, [-1], :]) # use only the last position
return embeddings.tolist()
return []
def generate_completions(self, text: str, samples: int, new_tokens: int, temperature: float, top_k: int):
result = []
timer = time.time()
with torch.no_grad():
prompt_tokens = self.encode_prompt(text)
for _ in range(samples):
y = self.model.generate(prompt_tokens, new_tokens, temperature=temperature, top_k=top_k)
result.append(self.tokenizer.decode(y[0].tolist()).strip())
dt = time.time() - timer
logger.info(f"{new_tokens*samples} completion tokens generated in {dt:.2f}secs ({new_tokens*samples/dt:.2f} tokens/sec) for {prompt_tokens.shape[1]} input tokens")
print("##################result:", result, "##################")
return result
if __name__ == '__main__':
config = AllamoConfiguration()
sampler = AllamoSampler(config)
# encode the beginning of the prompt
if config.prompt.startswith('FILE:'):
with open(config.prompt[5:], 'r', encoding='utf-8') as f:
config.prompt = f.read()
completions = sampler.generate_completions(config.prompt, config.num_samples, config.max_new_tokens, temperature=config.temperature, top_k=config.top_k)
logger.info("Completions:")
for completion in completions:
logger.info(completion)
logger.info('----------------')
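# Note: the prompt can also be read from a file by prefixing it with "FILE:", e.g. --prompt="FILE:my_prompt.txt"
# (handled by the config.prompt.startswith('FILE:') branch above); my_prompt.txt is a hypothetical path.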
import logging
from flask import Flask, request, jsonify
from sample import AllamoSampler
from allamo.configuration import AllamoConfiguration
config = AllamoConfiguration()
sampler = AllamoSampler(config)
app = Flask(__name__)
@app.route('/tokens', methods=['POST'])
def tokens():
payload = request.json
prompt = payload.get('prompt') if 'prompt' in payload else None
tokens = sampler.tokenize_prompt(prompt)
return jsonify({'tokens': tokens, 'length': len(tokens)})
@app.route('/embeddings', methods=['POST'])
def embeddings():
payload = request.json
prompt = payload.get('prompt') if 'prompt' in payload else None
embeddings = sampler.generate_embeddings(prompt)
return jsonify({'embeddings': embeddings})
@app.route('/completions', methods=['POST'])
def completions():
payload = request.json
prompt = payload.get('prompt') if 'prompt' in payload else None
num_samples = int(payload.get('num_samples')) if 'num_samples' in payload else config.num_samples
max_new_tokens = int(payload.get('max_new_tokens')) if 'max_new_tokens' in payload else config.max_new_tokens
temperature = float(payload.get('temperature')) if 'temperature' in payload else config.temperature
top_k = int(payload.get('top_k')) if 'top_k' in payload else config.top_k
completions = sampler.generate_completions(prompt, num_samples, max_new_tokens, temperature, top_k)
return jsonify({'completions': completions})
if __name__ == '__main__':
app.run()
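# A minimal client sketch for the endpoints above, assuming the server is running locally on
# Flask's default port 5000 (the same assumption the Gradio frontend below makes). Illustrative only:
#
# import requests
# base_url = "http://localhost:5000"
# resp = requests.post(f"{base_url}/tokens", json={"prompt": "Long long time ago"})
# print(resp.json())  # {'tokens': [...], 'length': ...}
# resp = requests.post(f"{base_url}/completions",
#                      json={"prompt": "Long long time ago", "num_samples": 2,
#                            "max_new_tokens": 50, "temperature": 0.7, "top_k": 200})
# print(resp.json()["completions"])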
import gradio as gr
import requests
API_URL = "http://localhost:5000/completions"
def get_completion(prompt, num_samples, max_new_tokens, temperature, top_k):
headers = {"Content-Type": "application/json"}
data = {
"prompt": prompt,
"num_samples": num_samples,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_k": top_k
}
response = requests.post(API_URL, headers=headers, json=data)
response.raise_for_status()
completions = list(response.json()["completions"])
return '\n\n-------\n\n'.join(completions)
iface = gr.Interface(
fn=get_completion,
inputs=["text", \
gr.Number(value=1, label="Number of samples to generate"), \
gr.Number(value=50, label="Number of tokens to generate in each sample"), \
gr.Slider(0.1, 1.9, step=0.1, value=0.8, label="Temperature value for text generation"), \
gr.Number(value=200, label="Top k most likely tokens to be retained during text generation") \
],
outputs="text",
title="Text Completion with Allamo",
theme="light"
)
iface.launch(server_name="0.0.0.0", server_port=7809)
import torch
from dataclasses import dataclass
from typing import Optional, Tuple, List
from dataset import vocabulary
vocab = vocabulary()
vocab_size = len(vocab)
# Define parameters dataclass: we'll use these parameters during model building, training and inference.
@dataclass
class ModelArgs:
dim: int = 512 # embedding dimension
n_layers: int = 8 # number of model decoder blocks
n_heads: int = 8 # number of heads for queries embedding
n_kv_heads: int = 4 # number of heads for keys and values embedding
vocab_size: int = vocab_size # Length of vocabulary
multiple_of: int = 256 # Required to calculate the hidden dim of the feedforward network
ffn_dim_multiplier: Optional[float] = None # Required to calculate the hidden dim of the feedforward network
norm_eps: float = 1e-5 # Default epsilon value used for the RMSNorm calculation
rope_theta: float = 10000.0 # Default theta value for the RoPE calculation
max_batch_size: int = 10 # Max batch size
max_seq_len: int = 256 # Max sequence length
epochs: int = 2500 # Total number of training iterations
log_interval: int = 10 # Interval (in iterations) at which to print logs and loss values
device: str = 'cuda' if torch.cuda.is_available() else 'cpu' # Assign device to cuda or cpu based on availability
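# Quick sanity check of the derived quantities implied by these defaults
# (illustrative only; the numbers follow directly from the fields above):
#   head_dim = dim // n_heads        = 512 // 8 = 64
#   n_rep    = n_heads // n_kv_heads = 8 // 4   = 2  (each key/value head serves 2 query heads)
# e.g.:
#   args = ModelArgs()
#   assert args.dim // args.n_heads == 64 and args.n_heads // args.n_kv_heads == 2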
import torch
def loaddata(prompts="Hello World", txt='tiny_shakespeare.txt'):
# We use the Tiny Shakespeare dataset for a character-level tokenizer. Parts of the following tokenizer are adapted from Andrej Karpathy's nanoGPT (https://github.com/karpathy/nanoGPT/blob/master/data/shakespeare_char/prepare.py), which explains the approach very well.
# Load the tiny_shakespeare data file (https://github.com/tamangmilan/llama3/blob/main/tiny_shakespeare.txt).
with open(txt, 'r') as f:
data = f.read()
return data
def vocabulary():
data = loaddata()
# Prepare vocabulary by taking all the unique characters from the tiny_shakespeare data
vocab = sorted(list(set(data)))
# Training the Llama 3 model requires additional tokens such as <|begin_of_text|>, <|end_of_text|> and <|pad_id|>, so we add them to the vocabulary
vocab.extend(['<|begin_of_text|>','<|end_of_text|>','<|pad_id|>'])
vocab_size = len(vocab)
return vocab
def tokenlizer():
vocab = vocabulary()
# Create a mapping between characters and their corresponding integer indexes in the vocabulary.
# This is needed to build the tokenizer's encode and decode functions.
itos = {i:ch for i, ch in enumerate(vocab)}
stoi = {ch:i for i, ch in enumerate(vocab)}
# Define tensor token variable to be used later during model training
# token_bos = torch.tensor([stoi['<|begin_of_text|>']], dtype=torch.int, device=device)
# token_eos = torch.tensor([stoi['<|end_of_text|>']], dtype=torch.int, device=device)
# token_pad = torch.tensor([stoi['<|pad_id|>']], dtype=torch.int, device=device)
token_bos = torch.tensor([stoi['<|begin_of_text|>']], dtype=torch.int).cuda()
token_eos = torch.tensor([stoi['<|end_of_text|>']], dtype=torch.int).cuda()
token_pad = torch.tensor([stoi['<|pad_id|>']], dtype=torch.int).cuda()
return stoi, itos, token_bos, token_eos, token_pad
# Tokenizers encode function: take a string, output a list of integers
def encode(s):
vocab = vocabulary()
stoi = {ch:i for i, ch in enumerate(vocab)}
return [stoi[ch] for ch in s]
# Tokenizers decode function: take a list of integers, output a string
def decode(l):
stoi, itos, token_bos, token_eos, token_pad = tokenlizer()
return ''.join(itos[i] for i in l)
# prompts = "Hello World"
# encoded_tokens = encode(prompts)
# decoded_text = decode(encoded_tokens)
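# A minimal round-trip sanity check for the tokenizer above (a sketch; it assumes tiny_shakespeare.txt
# is present, that every character of the test string occurs in the corpus, and that a CUDA device is
# available, because tokenlizer() places its special tokens on the GPU):
if __name__ == '__main__':
    sample_text = "Hello World"
    tokens = encode(sample_text)
    assert decode(tokens) == sample_text
    print(f"{sample_text!r} -> {tokens} -> {decode(tokens)!r}")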
import torch
from torch import nn
from torch.nn import functional as F
from model import Transformer
from dataset import loaddata, tokenlizer, encode, decode
from config import ModelArgs
## Inference Llama 3 Model
# This function generates text sequences based on provided prompts using the Llama 3 model we've built and trained.
def generate(model, prompts: str, params: ModelArgs, max_gen_len: int=500, temperature: float = 0.6, top_p: float = 0.9):
# prompt_tokens: List of user input texts or prompts
# max_gen_len: Maximum length of the generated text sequence.
# temperature: Temperature value for controlling randomness in sampling. Defaults to 0.6.
# top_p: Top-p probability threshold for sampling prob output from the logits. Defaults to 0.9.
# prompt_tokens = [0]
bsz = 1 # For inference we assume the user passes in a single prompt, which we treat as a batch of 1
stoi, itos, token_bos, token_eos, token_pad = tokenlizer()
prompt_tokens = token_bos.tolist() + encode(prompts)
assert len(prompt_tokens) <= params.max_seq_len, "prompt token length must not exceed max_seq_len"
total_len = min(len(prompt_tokens)+max_gen_len, params.max_seq_len)
# this tokens matrix is to store the input prompts and all the output that is generated by model.
# later we'll use the tokenizers decode function to decode this token to view results in text format
tokens = torch.full((bsz,total_len), fill_value=token_pad.item(), dtype=torch.long, device=params.device)
# fill in the prompt tokens into the token matrix
tokens[:,:len(prompt_tokens)] = torch.tensor(prompt_tokens, dtype=torch.long, device=params.device)
# Create input_text_mask so we can later identify whether a position holds a prompt token or a padding token:
# True if it is a prompt token, False if it is a padding token
input_text_mask = tokens != token_pad.item()
#now we can start inferencing using one token at a time from the prompt_tokens list starting with the first position.
prev_pos = 0
for cur_pos in range(1, total_len):
with torch.no_grad():
logits, _ = model(x=tokens[:,prev_pos:cur_pos], start_pos=prev_pos)
if temperature > 0:
probs = torch.softmax(logits[:, -1]/temperature, dim=-1)
next_token = sample_top_p(probs, top_p)
else:
next_token = torch.argmax(logits[:, -1], dim=-1)
next_token = next_token.reshape(-1)
# only replace the token if it's a padding token
next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token)
tokens[:, cur_pos] = next_token
prev_pos = cur_pos
if not input_text_mask[:, cur_pos].item() and next_token.item() == token_eos.item(): # stop once EOS is generated at a non-prompt position
break
output_tokens, output_texts = [], []
for i, toks in enumerate(tokens.tolist()):
# eos_idx = toks.index(token_eos.item())
if token_eos.item() in toks:
eos_idx = toks.index(token_eos.item())
toks = toks[:eos_idx]
output_tokens.append(toks)
output_texts.append(decode(toks))
return output_tokens, output_texts
# Perform top-p (nucleus) sampling on a probability distribution.
# probs (torch.Tensor): Probability distribution tensor derived from the logits.
# p: Probability threshold for top-p sampling.
# According to the paper, Top-p sampling selects the smallest set of tokens whose cumulative probability mass exceeds the threshold p.
# The distribution is renormalized based on the selected tokens.
def sample_top_p(probs, p):
probs_sort, prob_idx = torch.sort(probs, dim=-1, descending=True)
probs_sum = torch.cumsum(probs_sort, dim=-1)
mask = probs_sum - probs_sort > p
probs_sort[mask] = 0.0
probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
next_token = torch.multinomial(probs_sort, num_samples=1)
next_token = torch.gather(prob_idx, -1, next_token)
# The sampled token index from the vocabulary is returned
return next_token
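# Worked example of the top-p arithmetic above (illustrative numbers only):
#   probs      = [0.50, 0.30, 0.15, 0.05], p = 0.7
#   probs_sort = [0.50, 0.30, 0.15, 0.05]  (already sorted descending)
#   probs_sum  = [0.50, 0.80, 0.95, 1.00]
#   mask = probs_sum - probs_sort > p  ->  [False, False, True, True]
#   after masking and renormalizing:       [0.625, 0.375, 0.0, 0.0]
# so sampling only ever picks one of the two most likely tokens, e.g.
#   sample_top_p(torch.tensor([[0.50, 0.30, 0.15, 0.05]]), p=0.7)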
def load_ckpt(model, args:ModelArgs):
epoch = 2499
path = "checkpoints/model_{}.pth".format(epoch)
checkpoint = torch.load(path, map_location='cpu')
model.load_state_dict(checkpoint['model'])
device = args.device
model.to(device)
model.eval()
return model
## Perform inference on user input prompts
# prompts = "Consider you what services he has done"
prompts = "Would you proceed especially against Caius Marcius?"
model = Transformer(ModelArgs).to(ModelArgs.device)
model = load_ckpt(model, ModelArgs)
output_tokens, output_texts = generate(model, prompts, ModelArgs)
output_texts = output_texts[0].replace("<|begin_of_text|>", "")
print("output: ", output_texts)
import torch
from torch import nn
from torch.nn import functional as F
from typing import Optional, Tuple, List
from config import ModelArgs
import math
## The Decoder Block
# Note: Since the Llama 3 model was developed by Meta, to stay in sync with their codebase and for future compatibility,
# I will reuse most of the code from Meta's GitHub repository, with the changes needed to achieve our goal.
# Note: Since we want to see training and inference results quickly rather than aim for high accuracy, we use smaller values for most of the parameters than those set in the official Llama 3 model.
## the RMSNorm
class RMSNorm(nn.Module):
def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
self.args = ModelArgs
self.eps = eps
# Scaling parameter gamma, initialized to ones; the number of parameters is equal to dim
self.weight = nn.Parameter(torch.ones(dim).to(self.args.device))
def _norm(self, x):
return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps).to(self.args.device)
def forward(self, x):
#Shape: x[bs,seq,dim]
output = self._norm(x.float()).type_as(x)
#Shape: x[bs,seq,dim] -> x_norm[bs,seq,dim]
return output * self.weight
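# In formula form, RMSNorm computes (per position, over the last dimension):
#   output = x / sqrt(mean(x^2) + eps) * gamma
# i.e. it rescales by the root mean square of the activations instead of subtracting a mean
# and dividing by a standard deviation as LayerNorm does.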
## The RoPE
def precompute_freqs_cis(dim:int, seq_len: int, theta: float=10000.0):
# Computing Theta value for each dim pair which is dim/2
device = ModelArgs.device
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2,device=device)[:(dim//2)].float()/dim))
# Computing range of positions(m) in the sequence
t = torch.arange(seq_len, dtype=torch.float32, device=device)
# freqs now gives the theta values for every token position in the sequence
freqs = torch.outer(t, freqs).to(device)
# These angles form the rotation matrix, which is converted to polar (complex) form so the rotation can be applied to the embeddings
freqs_cis = torch.polar(torch.ones_like(freqs).to(device), freqs).to(device)
return freqs_cis
def reshape_for_broadcast(freqs_cis, x):
ndim = x.ndim
assert 0<=1<ndim
assert freqs_cis.shape == (x.shape[1],x.shape[-1]), "the last two dimension of freqs_cis, x must match"
shape = [d if i==1 or i==ndim-1 else 1 for i,d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor)->Tuple[torch.Tensor, torch.Tensor]:
device = ModelArgs.device
# Applying rotary positional encoding to both query and key embedding together
# First: The last dimension of xq and xk embedding needs to be reshaped to make it a pair. As rotation matrix is applied to each pair of dim.
# Next: convert both xq and xk to complex number as the rotation matrix is only applicable to complex number
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)).to(device) #xq_:[bsz, seq_len, n_heads, head_dim/2]
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)).to(device) #xk_:[bsz, seq_len, n_heads, head_dim/2]
# The rotation matrix(freqs_cis) dimensions across seq_len(dim=1) and head_dim(dim=3) should match with the embedding
# Also, the shape freqs_cis should be the same with xq and xk, hence change the shape of freqs_cis:[seq_len,head_dim] -> freqs_cis:[1,seq_len,1,head_dim]
freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
#Finally, perform rotation operation by multiplying with freqs_cis.
#After the rotation is completed, convert both xq_out and xk_out back to real number and return
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).to(device) #xq_out:[bsz, seq_len, n_heads, head_dim]
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).to(device) #xk_out:[bsz, seq_len, n_heads, head_dim]
return xq_out.type_as(xq), xk_out.type_as(xk)
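# Shape sanity check for the RoPE helpers above (a sketch using the head_dim = 512 // 8 = 64
# implied by ModelArgs; the rotation never changes tensor shapes):
#   freqs_cis = precompute_freqs_cis(dim=64, seq_len=256)   # complex tensor of shape [256, 32]
#   xq = torch.randn(1, 256, 8, 64); xk = torch.randn(1, 256, 4, 64)
#   xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis)
#   assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape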
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.args = args
# Embedding dimension
self.dim = args.dim
# Number of heads assigned to Query
self.n_heads = args.n_heads
# Number of heads assigned to Key and values. If "None", the number will be same as Query.
self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
# Dimension of each head relative to model dimension
self.head_dim = args.dim // args.n_heads
# Number of repetitions needed to make the Key/Value heads match the number of Query heads
self.n_rep = args.n_heads // args.n_kv_heads
# Weight initialization for Queries, Keys, Values and Output. Notice that the out_features of the q and kv weights depend on their respective number of heads
self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False, device=args.device)
self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False, device=args.device)
self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False, device=args.device)
self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False, device=args.device)
# Initialize caches to store Key, Values at start. (KV Cache Implementation)
self.cache_k = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim), device=args.device)
self.cache_v = torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim), device=args.device)
def forward(self, x: torch.Tensor, start_pos, inference):
# Shape of the input embedding: [bsz,seq_len,dim]
bsz, seq_len, _ = x.shape
# Mask will be used during 'Training' and is not required for 'inference' due to the use of KV cache.
mask = None
xq = self.wq(x) #x[bsz,seq_len,dim]*wq[dim,n_heads * head_dim] -> q[bsz,seq_len,n_heads * head_dim]
xk = self.wk(x) #x[bsz,seq_len,dim]*wk[dim,n_kv_heads * head_dim] -> k[bsz,seq_len,n_kv_heads * head_dim]
xv = self.wv(x) #x[bsz,seq_len,dim]*wv[dim,n_kv_heads * head_dim] -> v[bsz,seq_len,n_kv_heads * head_dim]
# Reshaping Queries, Keys and Values by their number of heads. (Grouped Query Attention implementation)
xq = xq.view(bsz, seq_len, self.n_heads, self.head_dim) #xq[bsz,seq_len,n_heads, head_dim]
xk = xk.view(bsz, seq_len, self.n_kv_heads, self.head_dim) #xk[bsz,seq_len,n_kv_heads, head_dim]
xv = xv.view(bsz, seq_len, self.n_kv_heads, self.head_dim) #xv[bsz,seq_len,n_kv_heads, head_dim]
# Model - Inference Mode: kv-cache is enabled at inference mode only.
if inference:
# Compute rotation matrix for each position in the sequence
freqs_cis = precompute_freqs_cis(dim=self.head_dim, seq_len=self.args.max_seq_len * 2)
# During inference, take only the slice of the rotation matrix corresponding to the current token positions.
freqs_cis = freqs_cis[start_pos : start_pos + seq_len]
# Apply RoPE to Queries and Keys embeddings
xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
self.cache_k = self.cache_k.to(xq)
self.cache_v = self.cache_v.to(xq)
# Store Keys and Values token embedding into their respective cache [KV Cache Implementation]
self.cache_k[:bsz, start_pos:start_pos + seq_len] = xk
self.cache_v[:bsz, start_pos:start_pos + seq_len] = xv
# Gather all cached Key and Value embeddings up to the current token position for the attention calculation
keys = self.cache_k[:bsz, :start_pos + seq_len]
values = self.cache_v[:bsz, :start_pos + seq_len]
# At this point the Keys/Values shape does not match the Queries embedding, but they must match in order to compute the attention score
# Use the repeat_kv function to make the Keys/Values shape the same as the Queries shape
keys = repeat_kv(keys, self.n_rep) #keys[bsz,seq_len,n_heads,head_dim]
values = repeat_kv(values, self.n_rep) #values[bsz,seq_len,n_heads,head_dim]
# Mode - Training mode: KV-Cache not implemented
else:
# Compute rotation matrix and apply RoPE to queries and keys for training.
freqs_cis = precompute_freqs_cis(dim=self.head_dim, seq_len=self.args.max_seq_len)
#xq[bsz,seq_len,n_heads, head_dim], xk[bsz,seq_len,n_heads, head_dim]
xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
# Use repeat_kv function to make Keys,Values shape same as the queries shape
#keys[bsz,seq_len,n_heads,head_dim], #values[bsz,seq_len,n_heads,head_dim]
keys = repeat_kv(xk, self.n_rep)
values = repeat_kv(xv, self.n_rep)
# For training mode, we'll compute mask and apply to the attention score later
mask = torch.full((seq_len, seq_len),float("-inf"),device=self.args.device)
mask = torch.triu(mask, diagonal=1).to(self.args.device)
# To compute attention, we transpose the queries, keys and values so that heads are at dim 1 and the sequence is at dim 2
xq = xq.transpose(1,2) #xq[bsz,n_heads,seq_len,head_dim]
keys = keys.transpose(1,2) #keys[bsz,n_heads,seq_len,head_dim]
values = values.transpose(1,2) #values[bsz,n_heads,seq_len,head_dim]
# Computing attention score
scores = torch.matmul(xq, keys.transpose(2,3)).to(self.args.device)/math.sqrt(self.head_dim)
if mask is not None:
scores = scores + mask
# Apply softmax to the attention score
scores = F.softmax(scores.float(), dim=-1).type_as(xq)
# Matrix multiplication of attention score with the values
output = torch.matmul(scores, values).to(self.args.device)
# We get the contextual embedding for each head
# All heads need to be reshaped back and combined to give a single contextual attention output
# Shape change: output[bsz,n_heads,seq_len,head_dim] -> output[bsz,seq_len, n_heads,head_dim] -> output[bsz,seq_len, n_heads * head_dim]
output = output.transpose(1,2).contiguous().view(bsz, seq_len, -1)
# shape: output [bsz,seq_len,dim]
return self.wo(output)
# If the number of key/value heads is less than the number of query heads, this function expands the key/value embeddings by the required number of repetitions
def repeat_kv(x:torch.Tensor, n_rep: int)-> torch.Tensor:
bsz, seq_len, n_kv_heads, head_dim = x.shape
if n_rep == 1:
return x
return (
x[:,:,:,None,:]
.expand(bsz,seq_len,n_kv_heads,n_rep, head_dim)
.reshape(bsz,seq_len,n_kv_heads * n_rep, head_dim)
)
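# Example with the default ModelArgs: keys of shape
#   [max_batch_size=10, seq_len=256, n_kv_heads=4, head_dim=64]
# repeated with n_rep = n_heads // n_kv_heads = 2 become
#   [10, 256, 8, 64]
# so the 4 key/value heads line up with the 8 query heads.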
## The Feedforward Network (SwiGLU activation)
class FeedForward(nn.Module):
def __init__(self, dim:int, hidden_dim:int, multiple_of:int, ffn_dim_multiplier:Optional[float], args:ModelArgs):
super().__init__()
# Models embedding dimension
self.dim = dim
# We use the hidden dimension calculation shared by Meta, which is the ideal one for this model
# The hidden dimension is calculated such that it is a multiple of 256.
hidden_dim = int(2 * hidden_dim/3)
if ffn_dim_multiplier is not None:
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
# Define the hidden layer weights
self.w1 = nn.Linear(self.dim, hidden_dim, bias=False, device=args.device)
self.w2 = nn.Linear(hidden_dim, self.dim, bias=False, device=args.device)
self.w3 = nn.Linear(self.dim, hidden_dim, bias=False, device=args.device)
def forward(self, x):
# Shape: [bsz,seq_len,dim]
return self.w2(F.silu(self.w1(x)) * self.w3(x))
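# Worked example of the hidden dimension arithmetic above with our defaults
# (dim=512, hidden_dim passed in as 4*dim=2048, multiple_of=256, no multiplier):
#   hidden_dim = int(2 * 2048 / 3)           -> 1365
#   hidden_dim = 256 * ((1365 + 255) // 256) -> 256 * 6 = 1536
# so w1 and w3 project [512 -> 1536] and w2 maps [1536 -> 512].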
## The Decoder Block. The class is named TransformerBlock to match the Meta Llama 3 codebase.
class TransformerBlock(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.args = args
# Initialize RMSNorm for attention
self.attention_norm = RMSNorm(dim=args.dim, eps = args.norm_eps)
# Initialize the Attention class
self.attention = Attention(args)
# Initialize RMSNorm for the feedforward block
self.ff_norm = RMSNorm(dim=args.dim, eps = args.norm_eps)
# Initialize the feedforward block
self.feedforward = FeedForward(args.dim, 4 * args.dim, args.multiple_of, args.ffn_dim_multiplier, args)
def forward(self, x, start_pos, inference):
# start_pos = token position for inference mode, inference = True for inference and False for training mode
# i) pass input embedding to attention_norm and then pass to attention block.
# ii) the output of attention is then added to embedding(before norm)
h = x + self.attention(self.attention_norm(x), start_pos, inference)
# i) pass attention output to ff_norm and then pass to the feedforward network.
# ii) the output of feedforward network is then added to the attention output(before ff_norm)
out = h + self.feedforward(self.ff_norm(h))
# Shape: [bsz,seq_len,dim]
return out
class Transformer(nn.Module):
def __init__(self, params: ModelArgs):
super().__init__()
# set all the ModelArgs in params variable
self.params = params
# Initialize the token embedding layer of the input block
self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
# Initialize the decoder blocks and store them inside a ModuleList.
# Our configuration uses n_layers (8) decoder blocks, while the official Llama 3 8B model has 32 blocks.
self.layers = nn.ModuleList()
for layer_id in range(params.n_layers):
self.layers.append(TransformerBlock(args=params))
# Initialize RMSNorm for the output block
self.norm = RMSNorm(params.dim, eps = params.norm_eps)
# Initialize the linear layer of the output block.
self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
def forward(self, x, start_pos=0, targets=None):
# start_pos = token position for inference mode; whether we are in inference or training mode is derived from targets below
# x is the batch of token_ids generated from the texts or prompts using tokenizers.
# x[bsz, seq_len] -> h[bsz, seq_len, dim]
h = self.tok_embeddings(x)
# If targets is None, inference mode is activated (inference = True); otherwise training mode is used (inference = False).
if targets is None:
inference = True
else:
inference = False
# The embeddings (h) will then pass through all the decoder blocks.
for layer in self.layers:
h = layer(h, start_pos, inference)
# The output from the final decoder block will feed into the RMSNorm
h = self.norm(h)
# After normalization, the embedding h is fed into the linear layer.
# The main task of the linear layer is to generate logits that map the embeddings to the vocabulary size.
# h[bsz, seq_len, dim] -> logits[bsz, seq_len, vocab_size]
logits = self.output(h).float()
loss = None
# Inference mode is activated if the targets is not available
if targets is None:
loss = None
# Training mode is activated if the targets are available. And Loss will be calculated for further model training.
else:
loss = F.cross_entropy(logits.view(-1, self.params.vocab_size), targets.view(-1))
return logits, loss
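# Expected shapes for a forward pass with the default ModelArgs (illustrative; building the model
# requires tiny_shakespeare.txt for the vocabulary and, with device='cuda', an available GPU):
#   x      : [bsz, seq_len]              token ids
#   h      : [bsz, seq_len, 512]         after tok_embeddings and the decoder blocks
#   logits : [bsz, seq_len, vocab_size]  after the output projection
# e.g.
#   model = Transformer(ModelArgs).to(ModelArgs.device)
#   logits, loss = model(x=tokens, targets=targets)  # training mode (loss computed)
#   logits, _ = model(x=tokens, start_pos=0)         # inference mode (KV cache in Attention)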
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import time
import pandas as pd
from matplotlib import pyplot as plt
from dataset import loaddata, tokenlizer, encode, decode
from model import Transformer
from config import ModelArgs
## Train Llama 3 Model
# Define function to generate batches from the given dataset
def get_dataset_batch(data, split, args:ModelArgs):
seq_len = args.max_seq_len
batch_size = args.max_batch_size
device = args.device
train = data[:int(0.8 * len(data))]
val = data[int(0.8 * len(data)): int(0.9 * len(data))]
test = data[int(0.9 * len(data)):]
batch_data = train
if split == "val":
batch_data = val
if split == "test":
batch_data = test
# Picking random starting points from the dataset to give random samples for training, validation and testing.
stoi, itos, token_bos, token_eos, token_pad = tokenlizer()
ix = torch.randint(0, len(batch_data) - seq_len - 3, (batch_size,)).to(device)
x = torch.stack([torch.cat([token_bos, batch_data[i:i+seq_len-1]]) for i in ix]).long().to(device)
y = torch.stack([torch.cat([batch_data[i+1:i+seq_len], token_eos]) for i in ix]).long().to(device)
return x,y
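# Illustration of how one (x, y) pair is built above for a random start index i
# (c_k denotes the k-th character of the chosen split):
#   x = [<|begin_of_text|>, c_i, c_{i+1}, ..., c_{i+seq_len-2}]
#   y = [c_{i+1}, c_{i+2}, ..., c_{i+seq_len-1}, <|end_of_text|>]
# Both tensors have length seq_len, and y is used as the next-token target for x by the
# cross-entropy loss computed inside the model.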
# Define an evaluate_loss function to calculate and store training and validation loss for logging and plotting
@torch.no_grad()
def evaluate_loss(model, dataset, args:ModelArgs):
out = {}
model.eval()
for split in ["train", "val"]:
losses = []
for _ in range(10):
xb, yb = get_dataset_batch(dataset, split, args)
_, loss = model(x=xb, targets=yb)
losses.append(loss.item())
out[split] = np.mean(losses)
model.train()
return out
# Define a training function to perform model training
def train(model, optimizer, args:ModelArgs):
print("model: ", model)
data = loaddata()
dataset = torch.tensor(encode(data), dtype=torch.int).to(ModelArgs.device)
print(f"dataset-shape: {dataset.shape}")
epochs = args.epochs
log_interval = args.log_interval
device = args.device
losses = []
start_time = time.time()
for epoch in range(epochs):
optimizer.zero_grad()
xs, ys = get_dataset_batch(dataset, 'train', args)
xs = xs.to(device)
ys = ys.to(device)
logits, loss = model(x=xs, targets=ys)
loss.backward()
optimizer.step()
if epoch % log_interval == 0:
batch_time = time.time() - start_time
x = evaluate_loss(model, dataset, args)
losses += [x]
print(f"Epoch {epoch} | val loss {x['val']:.3f} | Time {batch_time:.3f}")
start_time = time.time()
# Print the final validation loss
print("validation loss: ", losses[-1]['val'])
# Save the checkpoint
save_file = {"model": model.state_dict(),
"optimizer": optimizer.state_dict(),
"epoch": epoch,
"args": args}
torch.save(save_file, "checkpoints/model_{}.pth".format(epoch))
# Display the interval losses in plot
return pd.DataFrame(losses).plot()
## Start training our Llama 3 model
model = Transformer(ModelArgs).to(ModelArgs.device)
optimizer = torch.optim.Adam(model.parameters())
train(model, optimizer, ModelArgs)
# Load
# checkpoint = torch.load(path, map_location='cpu')
# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['optimizer'])
# lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
# args.start_epoch = checkpoint['epoch'] + 1
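# A minimal resume sketch based on the save_file dict written above (keys: model, optimizer,
# epoch, args). This script defines no lr_scheduler, so that line from the snippet above does
# not apply here; the path matches what load_ckpt in the inference script expects:
#
# checkpoint = torch.load("checkpoints/model_2499.pth", map_location='cpu')
# model.load_state_dict(checkpoint['model'])
# optimizer.load_state_dict(checkpoint['optimizer'])
# start_epoch = checkpoint['epoch'] + 1
# model.to(ModelArgs.device)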
# Model code
modelCode=1118
# Model name
modelName=allamo_pytorch
# Model description
modelDescription=A fully open-source pretrained large model: the principles are explained as simply as possible and minimal reference implementations of the algorithms are provided, so that everyone can understand and reproduce them immediately and build their own algorithms on top; the ultimate goal is the free sharing of human civilization.
# Application scenarios
appScenario=inference, training, conversational QA, manufacturing, broadcast media, finance, energy, healthcare, smart home, education
# Framework type
frameType=pytorch