Commit 09d54a38 authored by luopl's avatar luopl
"Initial commit"

# Copyright 2025 StepFun Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# ==============================================================================
import torch.nn as nn
import torch
from typing import List
class LLaMaEmbedding(nn.Module):
    """Language model embeddings.

    Configuration (read from ``cfg``):
        hidden_size: hidden size
        padded_vocab_size: vocabulary size (padded)
        hidden_dropout: dropout probability for embeddings
        params_dtype: dtype used for the embedding parameters and outputs
        embedding_weights_in_fp32: keep the embedding weights in fp32 during lookup
        fp32_residual_connection: cast the output to fp32 for the residual stream
    """

    def __init__(self, cfg):
        super().__init__()
        self.hidden_size = cfg.hidden_size
        self.params_dtype = cfg.params_dtype
        self.fp32_residual_connection = cfg.fp32_residual_connection
        self.embedding_weights_in_fp32 = cfg.embedding_weights_in_fp32
        self.word_embeddings = torch.nn.Embedding(
            cfg.padded_vocab_size, self.hidden_size,
        )
        self.embedding_dropout = torch.nn.Dropout(cfg.hidden_dropout)
    def forward(self, input_ids):
        # Embeddings.
        if self.embedding_weights_in_fp32:
            self.word_embeddings = self.word_embeddings.to(torch.float32)
        embeddings = self.word_embeddings(input_ids)
        if self.embedding_weights_in_fp32:
            embeddings = embeddings.to(self.params_dtype)
            self.word_embeddings = self.word_embeddings.to(self.params_dtype)
        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()
        # If the input flag for fp32 residual connection is set, convert to float.
        if self.fp32_residual_connection:
            embeddings = embeddings.float()
        # Dropout.
        embeddings = self.embedding_dropout(embeddings)
        return embeddings
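
# Usage sketch (illustrative only; `cfg` is whatever config object the caller builds,
# carrying the fields read in __init__ above):
#   emb = LLaMaEmbedding(cfg)
#   hidden = emb(input_ids)   # input_ids: [batch, seq] int64 -> hidden: [seq, batch, hidden_size]
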
class StepChatTokenizer:
    """Step Chat Tokenizer"""

    def __init__(
        self, model_file, name="StepChatTokenizer",
        bot_token="<|BOT|>",                  # Begin of Turn
        eot_token="<|EOT|>",                  # End of Turn
        call_start_token="<|CALL_START|>",    # Call Start
        call_end_token="<|CALL_END|>",        # Call End
        think_start_token="<|THINK_START|>",  # Think Start
        think_end_token="<|THINK_END|>",      # Think End
        mask_start_token="<|MASK_1e69f|>",    # Mask start
        mask_end_token="<|UNMASK_1e69f|>",    # Mask end
    ):
        import sentencepiece

        self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
        self._vocab = {}
        self._inv_vocab = {}
        self._special_tokens = {}
        self._inv_special_tokens = {}
        self._t5_tokens = []

        for idx in range(self._tokenizer.get_piece_size()):
            text = self._tokenizer.id_to_piece(idx)
            self._inv_vocab[idx] = text
            self._vocab[text] = idx
            if self._tokenizer.is_control(idx) or self._tokenizer.is_unknown(idx):
                self._special_tokens[text] = idx
                self._inv_special_tokens[idx] = text

        self._unk_id = self._tokenizer.unk_id()
        self._bos_id = self._tokenizer.bos_id()
        self._eos_id = self._tokenizer.eos_id()

        for token in [
            bot_token, eot_token, call_start_token, call_end_token,
            think_start_token, think_end_token
        ]:
            assert token in self._vocab, f"Token '{token}' not found in tokenizer"
            assert token in self._special_tokens, f"Token '{token}' is not a special token"

        for token in [mask_start_token, mask_end_token]:
            assert token in self._vocab, f"Token '{token}' not found in tokenizer"

        self._bot_id = self._tokenizer.piece_to_id(bot_token)
        self._eot_id = self._tokenizer.piece_to_id(eot_token)
        self._call_start_id = self._tokenizer.piece_to_id(call_start_token)
        self._call_end_id = self._tokenizer.piece_to_id(call_end_token)
        self._think_start_id = self._tokenizer.piece_to_id(think_start_token)
        self._think_end_id = self._tokenizer.piece_to_id(think_end_token)
        self._mask_start_id = self._tokenizer.piece_to_id(mask_start_token)
        self._mask_end_id = self._tokenizer.piece_to_id(mask_end_token)
        self._underline_id = self._tokenizer.piece_to_id("\u2581")

    @property
    def vocab(self):
        return self._vocab

    @property
    def inv_vocab(self):
        return self._inv_vocab

    @property
    def vocab_size(self):
        return self._tokenizer.vocab_size()

    def tokenize(self, text: str) -> List[int]:
        return self._tokenizer.encode_as_ids(text)

    def detokenize(self, token_ids: List[int]) -> str:
        return self._tokenizer.decode_ids(token_ids)
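
# Usage sketch (illustrative; the .model file path is a placeholder, not from this repo):
#   tok = StepChatTokenizer("step_chat.model")
#   ids = tok.tokenize("hello world")   # -> List[int]
#   tok.detokenize(ids)                 # -> "hello world" for typical text (SentencePiece round trip)
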
class Tokens:
    def __init__(self, input_ids, cu_input_ids, attention_mask, cu_seqlens, max_seq_len) -> None:
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.cu_input_ids = cu_input_ids
        self.cu_seqlens = cu_seqlens
        self.max_seq_len = max_seq_len

    def to(self, device):
        self.input_ids = self.input_ids.to(device)
        self.attention_mask = self.attention_mask.to(device)
        self.cu_input_ids = self.cu_input_ids.to(device)
        self.cu_seqlens = self.cu_seqlens.to(device)
        return self
class Wrapped_StepChatTokenizer(StepChatTokenizer):
    def __call__(self, text, max_length=320, padding="max_length", truncation=True, return_tensors="pt"):
        # [bos, ..., eos, pad, pad, ..., pad]
        self.BOS = 1
        self.EOS = 2
        self.PAD = 2
        out_tokens = []
        attn_mask = []
        if len(text) == 0:
            part_tokens = [self.BOS] + [self.EOS]
            valid_size = len(part_tokens)
            if len(part_tokens) < max_length:
                part_tokens += [self.PAD] * (max_length - valid_size)
            out_tokens.append(part_tokens)
            attn_mask.append([1] * valid_size + [0] * (max_length - valid_size))
        else:
            for part in text:
                part_tokens = self.tokenize(part)
                part_tokens = part_tokens[:(max_length - 2)]  # leave 2 slots for bos and eos
                part_tokens = [self.BOS] + part_tokens + [self.EOS]
                valid_size = len(part_tokens)
                if len(part_tokens) < max_length:
                    part_tokens += [self.PAD] * (max_length - valid_size)
                out_tokens.append(part_tokens)
                attn_mask.append([1] * valid_size + [0] * (max_length - valid_size))
        out_tokens = torch.tensor(out_tokens, dtype=torch.long)
        attn_mask = torch.tensor(attn_mask, dtype=torch.long)

        # padding y based on tp size
        padded_len = 0
        padded_flag = padded_len > 0
        if padded_flag:
            pad_tokens = torch.tensor([[self.PAD] * max_length], device=out_tokens.device)
            pad_attn_mask = torch.tensor([[1] * padded_len + [0] * (max_length - padded_len)], device=attn_mask.device)
            out_tokens = torch.cat([out_tokens, pad_tokens], dim=0)
            attn_mask = torch.cat([attn_mask, pad_attn_mask], dim=0)

        # cu_seqlens
        cu_out_tokens = out_tokens.masked_select(attn_mask != 0).unsqueeze(0)
        seqlen = attn_mask.sum(dim=1).tolist()
        cu_seqlens = torch.cumsum(torch.tensor([0] + seqlen), 0).to(device=out_tokens.device, dtype=torch.int32)
        max_seq_len = max(seqlen)
        return Tokens(out_tokens, cu_out_tokens, attn_mask, cu_seqlens, max_seq_len)
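
# Illustrative example of the returned Tokens (hypothetical values; exact ids depend on the vocab):
#   tok = Wrapped_StepChatTokenizer("step_chat.model")   # placeholder path
#   tokens = tok(["a prompt", "a longer prompt"], max_length=320)
#   tokens.input_ids.shape      # [2, 320]: BOS/EOS framed, then PAD(=2) up to max_length
#   tokens.cu_seqlens           # e.g. tensor([0, n1, n1 + n2], dtype=int32) with n_i = valid lengths
#   tokens.cu_input_ids.shape   # [1, n1 + n2]: the unpadded ids packed into one row
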
from .utils import *
from .video_process import *
import numpy as np
import random
import torch
from functools import wraps
import torch.utils._device
def setup_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
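
# Note: the two cuDNN flags trade speed for run-to-run reproducibility
# (deterministic kernels, no autotuned kernel selection).
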
class EmptyInitOnDevice(torch.overrides.TorchFunctionMode):
    """Skip weight-init ops and redirect tensor constructors to a target device."""

    def __init__(self, device=None):
        self.device = device

    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Turn torch.nn.init.* calls into no-ops that return the tensor unchanged.
        if getattr(func, '__module__', None) == 'torch.nn.init':
            if 'tensor' in kwargs:
                return kwargs['tensor']
            else:
                return args[0]
        # Route tensor constructors to the requested device when none is given.
        if self.device is not None and func in torch.utils._device._device_constructors() and kwargs.get('device') is None:
            kwargs['device'] = self.device
        return func(*args, **kwargs)
def with_empty_init(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        with EmptyInitOnDevice('cpu'):
            return func(*args, **kwargs)
    return wrapper
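
# Usage sketch (illustrative; `build_model` / `SomeModule` are hypothetical names):
#   @with_empty_init
#   def build_model(cfg):
#       return SomeModule(cfg)   # init ops become no-ops; real weights come from a checkpoint
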
def culens2mask(
    cu_seqlens=None,
    cu_seqlens_kv=None,
    max_seqlen=None,
    max_seqlen_kv=None,
    is_causal=False
):
    assert len(cu_seqlens) == len(cu_seqlens_kv), "q and kv should have the same batch size"
    bsz = len(cu_seqlens) - 1
    seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
    seqlens_kv = cu_seqlens_kv[1:] - cu_seqlens_kv[:-1]
    # True marks positions that may attend to each other; padding stays False.
    attn_mask = torch.zeros(bsz, max_seqlen, max_seqlen_kv, dtype=torch.bool)
    for i, (seq_len, seq_len_kv) in enumerate(zip(seqlens, seqlens_kv)):
        if is_causal:
            attn_mask[i, :seq_len, :seq_len_kv] = torch.tril(torch.ones(seq_len, seq_len_kv)).bool()
        else:
            attn_mask[i, :seq_len, :seq_len_kv] = torch.ones([seq_len, seq_len_kv], dtype=torch.bool)
    return attn_mask
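
# Illustrative example: two packed sequences of lengths 3 and 5 (hypothetical values):
#   cu = torch.tensor([0, 3, 8]); cu_kv = torch.tensor([0, 3, 8])
#   mask = culens2mask(cu, cu_kv, max_seqlen=5, max_seqlen_kv=5)
#   mask.shape                                     # torch.Size([2, 5, 5])
#   mask[0, :3, :3].all(), mask[0, 3:, :].any()    # (True, False): only valid positions attend
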
import numpy as np
import datetime
import torch
import os
import imageio
class VideoProcessor:
    def __init__(self, save_path: str = './results', name_suffix: str = ''):
        self.save_path = save_path
        os.makedirs(self.save_path, exist_ok=True)
        self.name_suffix = name_suffix

    def crop2standard540p(self, vid_array):
        _, height, width, _ = vid_array.shape
        height_center = height // 2
        width_center = width // 2
        if width_center > height_center:    # landscape mode: center-crop to 960x540
            return vid_array[:, height_center - 270:height_center + 270, width_center - 480:width_center + 480]
        elif width_center < height_center:  # portrait mode: center-crop to 540x960
            return vid_array[:, height_center - 480:height_center + 480, width_center - 270:width_center + 270]
        else:
            return vid_array
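
    # Note: the crop assumes the input is at least 960x540 (landscape) or 540x960 (portrait);
    # square inputs are returned unchanged.
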
    def save_imageio_video(self, video_array: np.ndarray, output_filename: str, fps=25, codec='libx264'):
        ffmpeg_params = [
            "-vf", "atadenoise=0a=0.1:0b=0.1:1a=0.1:1b=0.1",  # denoise
        ]
        with imageio.get_writer(output_filename, fps=fps, codec=codec, ffmpeg_params=ffmpeg_params) as vid_writer:
            for img_array in video_array:
                vid_writer.append_data(img_array)
    def postprocess_video(self, video_tensor, output_file_name='', output_type="mp4", crop2standard540p=True):
        if len(self.name_suffix) == 0:
            video_path = os.path.join(self.save_path, f"{output_file_name}-{str(datetime.datetime.now())}.{output_type}")
        else:
            video_path = os.path.join(self.save_path, f"{output_file_name}-{self.name_suffix}.{output_type}")
        # [-1, 1] float -> [0, 255] uint8; batch samples are tiled along the height axis,
        # then frames are converted to (T, H, W, C) for imageio.
        video_tensor = (video_tensor.cpu().clamp(-1, 1) + 1) * 127.5
        video_tensor = torch.cat([t for t in video_tensor], dim=-2)
        video_array = video_tensor.clamp(0, 255).to(torch.uint8).numpy().transpose(0, 2, 3, 1)
        if crop2standard540p:
            video_array = self.crop2standard540p(video_array)
        self.save_imageio_video(video_array, video_path)
        print(f"Saved the generated video in {video_path}")