Commit 2bb1b0f0 authored by Xinchi Huang, committed by GitHub

Xinchi/fix offload (#57)

* Fix extra offload latency in the first step by pre-allocating pinned memory

* pre-commit

---------
Co-authored-by: de1star <843414674@qq.com>
parent af248eb2
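The change below follows a standard pattern for fast CPU offload in PyTorch: allocate a page-locked (pinned) host buffer once at load time, then reuse it for every device-to-host copy. Allocating pinned memory is expensive, so paying that cost lazily inside the first offload call is what produced the extra first-step latency this commit removes. A minimal standalone sketch of the pattern (class and attribute names here are illustrative, not the actual lightx2v classes):

import torch

class OffloadedParam:
    # Illustrative-only sketch of the pre-allocated pinned-buffer offload pattern.

    def load(self, tensor):
        self.tensor = tensor.cuda()
        # Pay the pinned-memory allocation cost once, at load time,
        # instead of on the first offload step.
        self.pinned = torch.empty(tensor.shape, dtype=tensor.dtype, pin_memory=True)

    def to_cpu(self, non_blocking=False):
        # copy_ into the reusable pinned buffer; with non_blocking=True the
        # D2H transfer is asynchronous with respect to the host.
        self.tensor = self.pinned.copy_(self.tensor, non_blocking=non_blocking)

    def to_cuda(self, non_blocking=False):
        # H2D copies from pinned memory are also eligible for async transfer.
        self.tensor = self.tensor.cuda(non_blocking=non_blocking)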
@@ -57,7 +57,9 @@ class MMWeight(MMWeightTemplate):
     def load(self, weight_dict):
         self.weight = weight_dict[self.weight_name].t()
+        self.pinned_weight = torch.empty(self.weight.shape, pin_memory=True, dtype=self.weight.dtype)
         self.bias = weight_dict[self.bias_name] if self.bias_name is not None else None
+        self.pinned_bias = torch.empty(self.bias.shape, pin_memory=True, dtype=self.bias.dtype) if self.bias is not None else None

     def apply(self, input_tensor):
         shape = (input_tensor.shape[0], self.weight.shape[1])
@@ -76,6 +78,15 @@ class MMWeight(MMWeightTemplate):
         destination[self.bias_name] = self.bias.cpu().detach().clone()
         return destination

+    def to_cpu(self, non_blocking=False):
+        # self.weight = self.weight.to("cpu", non_blocking=non_blocking)
+        self.weight = self.pinned_weight.copy_(self.weight, non_blocking=non_blocking).cpu()
+        if hasattr(self, "weight_scale"):
+            self.weight_scale = self.weight_scale.to("cpu", non_blocking=non_blocking)
+        if self.bias is not None:
+            # self.bias = self.bias.to("cpu", non_blocking=non_blocking)
+            self.bias = self.pinned_bias.copy_(self.bias, non_blocking=non_blocking).cpu()
+
 @MM_WEIGHT_REGISTER("Default-Force-FP32")
 class MMWeightForceFP32(MMWeight):
 ...
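One nuance not visible in this diff: a non_blocking copy issued on the default CUDA stream is asynchronous with respect to the host but still ordered after earlier work on that stream. To actually overlap the offload copy with compute, the copy is typically issued on a separate stream and synchronized before the pinned buffer is read. A hedged sketch, independent of how lightx2v schedules its offload:

import torch

offload_stream = torch.cuda.Stream()
weight_gpu = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)
pinned = torch.empty(weight_gpu.shape, dtype=weight_gpu.dtype, pin_memory=True)

# Make sure the side stream sees the fully-produced weight before copying.
offload_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(offload_stream):
    # D2H copy into the pinned buffer runs on the side stream.
    pinned.copy_(weight_gpu, non_blocking=True)

# ... compute on the default stream can proceed here ...

offload_stream.synchronize()  # ensure the copy finished before reading `pinned`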
+import torch
 from lightx2v.utils.registry_factory import TENSOR_REGISTER
@@ -8,9 +9,11 @@ class DefaultTensor:
     def load(self, weight_dict):
         self.tensor = weight_dict[self.tensor_name]
+        self.pinned_tensor = torch.empty(self.tensor.shape, pin_memory=True, dtype=self.tensor.dtype)

     def to_cpu(self, non_blocking=False):
-        self.tensor = self.tensor.to("cpu", non_blocking=non_blocking)
+        # self.tensor = self.tensor.to("cpu", non_blocking=non_blocking)
+        self.tensor = self.pinned_tensor.copy_(self.tensor, non_blocking=non_blocking).cpu()

     def to_cuda(self, non_blocking=False):
         self.tensor = self.tensor.cuda(non_blocking=non_blocking)
 ...
@@ -64,9 +64,9 @@ class WanModel:
         use_bfloat16 = self.config.get("use_bfloat16", True)
         with safe_open(file_path, framework="pt") as f:
             if use_bfloat16:
-                tensor_dict = {key: f.get_tensor(key).to(torch.bfloat16).to(self.device) for key in f.keys()}
+                tensor_dict = {key: f.get_tensor(key).to(torch.bfloat16).pin_memory().to(self.device) for key in f.keys()}
             else:
-                tensor_dict = {key: f.get_tensor(key).to(self.device) for key in f.keys()}
+                tensor_dict = {key: f.get_tensor(key).pin_memory().to(self.device) for key in f.keys()}
         return tensor_dict

     def _load_ckpt(self):
 ...
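The loader change is the complementary half: the tensors returned by safe_open are not pinned, and .pin_memory() copies each one into page-locked memory before the .to(self.device) upload, which lets the host-to-device transfer use the faster DMA path. A rough timing sketch (not part of the repo; shape and dtype are arbitrary) of the difference for the upload itself, once the one-time pinning cost has been paid:

import time
import torch

x_pageable = torch.randn(4096, 4096, dtype=torch.bfloat16)
x_pinned = x_pageable.pin_memory()   # one-time cost, paid while loading

torch.cuda.synchronize()
t0 = time.time()
_ = x_pageable.to("cuda")            # upload from pageable host memory
torch.cuda.synchronize()
t1 = time.time()
_ = x_pinned.to("cuda", non_blocking=True)  # upload from pinned host memory
torch.cuda.synchronize()
t2 = time.time()

print(f"pageable H2D: {t1 - t0:.4f}s  pinned H2D: {t2 - t1:.4f}s")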