import torch
import torch.nn as nn


class MemoryEfficientBlocks(nn.Module):
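    """Run num_blocks identical blocks while keeping only two resident on the GPU.

    The two instances are double-buffered: while the active block computes on a
    high-priority stream, the next block's weights are prefetched from CPU on a
    separate load stream, and then the pair is swapped.
    """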
    def __init__(self, block_class, num_blocks, **block_params):
        super().__init__()
        self.block_class = block_class
        self.num_blocks = num_blocks
        self.block_params = block_params
        
        # Instantiate the two blocks that will be double-buffered
        self.active_blocks = nn.ModuleList([
            block_class(**block_params) for _ in range(2)
        ])
        
        # Dedicated CUDA streams: compute gets higher priority than weight loading
        self.compute_stream = torch.cuda.Stream(priority=-1)  # high priority
        self.load_stream = torch.cuda.Stream(priority=0)      # normal priority

        # Free any cached GPU memory and cap per-process usage
        # (torch.cuda.empty_cache() returns None, so its result must not be stored)
        torch.cuda.empty_cache()
        torch.cuda.set_per_process_memory_fraction(0.8)  # limit GPU memory usage
        
        # CPU-side buffer holding one state dict per block (filled by initialize_weights)
        self.weight_buffer = []
        
    def initialize_weights(self, checkpoint, key):
        """Load every block's weights into pinned CPU memory."""
        # checkpoint = torch.load(checkpoint_path, map_location='cpu')
        for i in range(self.num_blocks):
            block_weights = {
                # Strip the '{key}.{i}.' prefix; pin the tensors so the later
                # non_blocking host-to-device copies are truly asynchronous
                k.replace(f'{key}.{i}.', ''): v.pin_memory()
                for k, v in checkpoint.items()
                if f'{key}.{i}.' in k
            }
            self.weight_buffer.append(block_weights)
    
    def prefetch_weights(self, block_idx):
        """Prefetch the given block's weights onto the GPU on the load stream."""
        with torch.cuda.stream(self.load_stream):
            next_weights = self.weight_buffer[block_idx]
            next_weights = {
                k: v.cuda(non_blocking=True)
                for k, v in next_weights.items()
            }
            # Load into the standby block (index 1); it becomes active after the swap
            self.active_blocks[1].load_state_dict(next_weights)
    
    def swap_blocks(self):
        """Swap the two blocks once compute and weight loading have both finished."""
        # Wait for the current block's computation to finish
        self.compute_stream.synchronize()
        # Wait for the next block's weights to finish loading
        self.load_stream.synchronize()

        # Swap: the freshly loaded standby block becomes the active one
        self.active_blocks[0], self.active_blocks[1] = \
            self.active_blocks[1], self.active_blocks[0]
    
    def forward(self, *args, **kwargs):
        """Forward pass that overlaps computation with weight loading."""
        for i in range(self.num_blocks):
            if i == 0:
                # The first block's weights are loaded synchronously
                self.active_blocks[0].load_state_dict(self.weight_buffer[0])

            # Run the current block on the main compute stream
            with torch.cuda.stream(self.compute_stream):
                current_block = self.active_blocks[0]
                outputs = current_block(*args, **kwargs)

            # Prefetch the next block's weights on the load stream
            if i < self.num_blocks - 1:
                self.prefetch_weights(i + 1)

            # Swap blocks and update weights
            self.swap_blocks()

            # Feed the current outputs back in as the next block's positional inputs
            args = list(args)
            if not isinstance(outputs, (tuple, list)):
                args[0] = outputs
            else:
                # Use a separate index so the outer loop variable i is not shadowed
                for j in range(len(outputs)):
                    args[j] = outputs[j]
            args = tuple(args)

        return outputs
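

# A minimal usage sketch. Assumptions not present in this file: TransformerBlock
# is a hypothetical stand-in block class, and 'model.ckpt' is a hypothetical
# checkpoint whose keys use the prefix 'blocks.{i}.'.
if __name__ == '__main__':
    class TransformerBlock(nn.Module):  # hypothetical block for illustration
        def __init__(self, dim):
            super().__init__()
            self.linear = nn.Linear(dim, dim)

        def forward(self, x):
            return self.linear(x)

    blocks = MemoryEfficientBlocks(TransformerBlock, num_blocks=4, dim=64).cuda()
    checkpoint = torch.load('model.ckpt', map_location='cpu')  # assumed path
    blocks.initialize_weights(checkpoint, key='blocks')
    out = blocks(torch.randn(2, 64, device='cuda'))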