Commit 4a457188 authored by raojy's avatar raojy
Browse files

fix

parent a570aeea
import numpy as np
import torch as th
def expand_t_like_x(t, x):
    """Reshape the time vector so it broadcasts against one batch of data.

    Args:
        t: [batch_dim,], time vector
        x: [batch_dim,...], data point (only ``x[0]`` is inspected for its rank)

    Returns:
        ``t`` viewed as ``[batch_dim, 1, ..., 1]`` with one singleton axis per
        dimension of ``x[0]``.
    """
    trailing_ones = (1,) * x[0].dim()
    return t.view(t.size(0), *trailing_ones)
#################### Coupling Plans ####################
class ICPlan:
    """Linear Coupling Plan.

    Interpolates ``x_t = alpha_t * x1 + sigma_t * x0`` with ``alpha_t = t`` and
    ``sigma_t = 1 - t``. Subclasses override the coefficient methods to obtain
    other paths.
    """

    def __init__(self, sigma=0.0):
        self.sigma = sigma

    @staticmethod
    def _select_sample(coef, i):
        """Return the i-th per-sample entry of ``coef``, or ``coef`` itself if it is a scalar."""
        if th.is_tensor(coef) and coef.dim() > 0:
            return coef[i]
        return coef

    def compute_alpha_t(self, t):
        """Compute the data coefficient along the path; returns ``(alpha_t, d_alpha_t)``."""
        return t, 1

    def compute_sigma_t(self, t):
        """Compute the noise coefficient along the path; returns ``(sigma_t, d_sigma_t)``."""
        return 1 - t, -1

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Compute the ratio between d_alpha and alpha"""
        return 1 / t

    def compute_drift(self, x, t):
        """We always output sde according to score parametrization;

        Returns:
            ``(-drift, diffusion)`` where ``drift = (d_alpha/alpha) * x`` and
            ``diffusion = (d_alpha/alpha) * sigma_t**2 - sigma_t * d_sigma_t``.
        """
        t = expand_t_like_x(t, x)
        alpha_ratio = self.compute_d_alpha_alpha_ratio_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        drift = alpha_ratio * x
        diffusion = alpha_ratio * (sigma_t**2) - sigma_t * d_sigma_t
        return -drift, diffusion

    def compute_diffusion(self, x, t, form="constant", norm=1.0):
        """Compute the diffusion term of the SDE

        Args:
            x: [batch_dim, ...], data point
            t: [batch_dim,], time vector
            form: str, form of the diffusion term
            norm: float, norm of the diffusion term

        Raises:
            NotImplementedError: if ``form`` is not a known diffusion form.
        """
        t = expand_t_like_x(t, x)
        # Each form is built lazily so that only the requested one is evaluated
        # (the "SBDM" form divides by t for the linear path).
        builders = {
            "constant": lambda: norm,
            "SBDM": lambda: norm * self.compute_drift(x, t)[1],
            "sigma": lambda: norm * self.compute_sigma_t(t)[0],
            "linear": lambda: norm * (1 - t),
            "decreasing": lambda: 0.25 * (norm * th.cos(np.pi * t) + 1) ** 2,
            "inccreasing-decreasing": lambda: norm * th.sin(np.pi * t) ** 2,
        }
        # The historical key is misspelled; keep it and also accept the correct spelling.
        builders["increasing-decreasing"] = builders["inccreasing-decreasing"]
        try:
            builder = builders[form]
        except KeyError:
            raise NotImplementedError(f"Diffusion form {form} not implemented") from None
        return builder()

    def get_score_from_velocity(self, velocity, x, t):
        """Wrapper function: transform velocity prediction model to score

        Args:
            velocity: [batch_dim, ...] shaped tensor; velocity model output
            x: [batch_dim, ...] shaped tensor; x_t data point
            t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        alpha_t, d_alpha_t = self.compute_alpha_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        mean = x
        reverse_alpha_ratio = alpha_t / d_alpha_t
        var = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
        score = (reverse_alpha_ratio * velocity - mean) / var
        return score

    def get_noise_from_velocity(self, velocity, x, t):
        """Wrapper function: transform velocity prediction model to denoiser

        Args:
            velocity: [batch_dim, ...] shaped tensor; velocity model output
            x: [batch_dim, ...] shaped tensor; x_t data point
            t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        alpha_t, d_alpha_t = self.compute_alpha_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        mean = x
        reverse_alpha_ratio = alpha_t / d_alpha_t
        var = reverse_alpha_ratio * d_sigma_t - sigma_t
        noise = (reverse_alpha_ratio * velocity - mean) / var
        return noise

    def get_velocity_from_score(self, score, x, t):
        """Wrapper function: transform score prediction model to velocity

        Args:
            score: [batch_dim, ...] shaped tensor; score model output
            x: [batch_dim, ...] shaped tensor; x_t data point
            t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        drift, var = self.compute_drift(x, t)
        velocity = var * score - drift
        return velocity

    def compute_mu_t(self, t, x0, x1):
        """Compute the mean of time-dependent density p_t

        ``x0`` / ``x1`` may be batched tensors or lists/tuples of per-sample
        tensors; in the latter case a list of per-sample means is returned.
        """
        t = expand_t_like_x(t, x1)
        alpha_t, _ = self.compute_alpha_t(t)
        sigma_t, _ = self.compute_sigma_t(t)
        if isinstance(x1, (list, tuple)):
            return [
                self._select_sample(alpha_t, i) * x1[i] + self._select_sample(sigma_t, i) * x0[i]
                for i in range(len(x1))
            ]
        return alpha_t * x1 + sigma_t * x0

    def compute_xt(self, t, x0, x1):
        """Return x_t for the pair (x0, x1); this is the deterministic mean of p_t."""
        xt = self.compute_mu_t(t, x0, x1)
        return xt

    def compute_ut(self, t, x0, x1, xt):
        """Compute the vector field corresponding to p_t

        ``xt`` is accepted for interface symmetry and is not used here.
        """
        t = expand_t_like_x(t, x1)
        _, d_alpha_t = self.compute_alpha_t(t)
        _, d_sigma_t = self.compute_sigma_t(t)
        if isinstance(x1, (list, tuple)):
            # Use the per-sample derivative when it is a batched tensor (as in
            # subclasses); the linear path returns plain scalars here.
            return [
                self._select_sample(d_alpha_t, i) * x1[i] + self._select_sample(d_sigma_t, i) * x0[i]
                for i in range(len(x1))
            ]
        return d_alpha_t * x1 + d_sigma_t * x0

    def plan(self, t, x0, x1):
        """Return ``(t, xt, ut)``: the time, the interpolated point and the target velocity."""
        xt = self.compute_xt(t, x0, x1)
        ut = self.compute_ut(t, x0, x1, xt)
        return t, xt, ut
class VPCPlan(ICPlan):
    """class for VP path flow matching"""

    def __init__(self, sigma_min=0.1, sigma_max=20.0):
        # Initialise the base class so inherited attributes (``sigma``) exist.
        super().__init__()
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max

    def log_mean_coeff(self, t):
        """Log of the data coefficient alpha_t at time ``t``."""
        return -0.25 * ((1 - t) ** 2) * (self.sigma_max - self.sigma_min) - 0.5 * (1 - t) * self.sigma_min

    def d_log_mean_coeff(self, t):
        """Time derivative of :meth:`log_mean_coeff`."""
        return 0.5 * (1 - t) * (self.sigma_max - self.sigma_min) + 0.5 * self.sigma_min

    def compute_alpha_t(self, t):
        """Compute coefficient of x1; returns ``(alpha_t, d_alpha_t)``."""
        alpha_t = self.log_mean_coeff(t)
        alpha_t = th.exp(alpha_t)
        d_alpha_t = alpha_t * self.d_log_mean_coeff(t)
        return alpha_t, d_alpha_t

    def compute_sigma_t(self, t):
        """Compute coefficient of x0; returns ``(sigma_t, d_sigma_t)``.

        ``sigma_t = sqrt(1 - alpha_t**2)``.
        """
        p_sigma_t = 2 * self.log_mean_coeff(t)
        sigma_t = th.sqrt(1 - th.exp(p_sigma_t))
        d_sigma_t = th.exp(p_sigma_t) * (2 * self.d_log_mean_coeff(t)) / (-2 * sigma_t)
        return sigma_t, d_sigma_t

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Numerically stable d_alpha_t / alpha_t (avoids forming alpha_t explicitly)."""
        return self.d_log_mean_coeff(t)

    def compute_drift(self, x, t):
        """Compute the drift term of the SDE

        Returns:
            ``(-0.5 * beta_t * x, beta_t / 2)`` with
            ``beta_t = sigma_min + (1 - t) * (sigma_max - sigma_min)``.
        """
        t = expand_t_like_x(t, x)
        beta_t = self.sigma_min + (1 - t) * (self.sigma_max - self.sigma_min)
        return -0.5 * beta_t * x, beta_t / 2
class GVPCPlan(ICPlan):
    """Trigonometric path: alpha_t = sin(pi * t / 2), sigma_t = cos(pi * t / 2)."""

    def __init__(self, sigma=0.0):
        super().__init__(sigma)

    def compute_alpha_t(self, t):
        """Return the x1 coefficient and its time derivative."""
        angle = t * np.pi / 2
        return th.sin(angle), np.pi / 2 * th.cos(angle)

    def compute_sigma_t(self, t):
        """Return the x0 coefficient and its time derivative."""
        angle = t * np.pi / 2
        return th.cos(angle), -np.pi / 2 * th.sin(angle)

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Return d_alpha_t / alpha_t in closed form, pi / (2 * tan(pi * t / 2))."""
        angle = t * np.pi / 2
        return np.pi / (2 * th.tan(angle))
import enum
import math
from typing import Callable
import numpy as np
import torch as th
from . import path
from .integrators import ode, sde
from .utils import mean_flat, expand_dims
from .dpm_solver import NoiseScheduleFlow, model_wrapper, DPM_Solver
@enum.unique
class ModelType(enum.Enum):
    """Kind of quantity the backbone model outputs."""

    # epsilon (noise) prediction
    NOISE = enum.auto()
    # score prediction, i.e. \nabla \log p(x)
    SCORE = enum.auto()
    # velocity prediction v(x)
    VELOCITY = enum.auto()
@enum.unique
class PathType(enum.Enum):
    """Identifier of the interpolation path used between noise and data."""

    LINEAR = enum.auto()
    GVP = enum.auto()
    VP = enum.auto()
@enum.unique
class WeightType(enum.Enum):
    """Identifier of the loss weighting scheme."""

    NONE = enum.auto()
    VELOCITY = enum.auto()
    LIKELIHOOD = enum.auto()
class Transport:
    """Couples a coupling plan (path sampler) with a model parameterization.

    Provides training-time sampling of ``(t, x0, x1)``, the training loss, and
    factories for the drift / score functions used by the samplers.
    """

    def __init__(self, *, model_type, path_type, loss_type, train_eps, sample_eps, snr_type, do_shift, seq_len):
        """Store the configuration and instantiate the path sampler for ``path_type``."""
        path_options = {
            PathType.LINEAR: path.ICPlan,
            PathType.GVP: path.GVPCPlan,
            PathType.VP: path.VPCPlan,
        }
        self.loss_type = loss_type
        self.model_type = model_type
        self.path_sampler = path_options[path_type]()
        self.train_eps = train_eps
        self.sample_eps = sample_eps
        self.snr_type = snr_type
        self.do_shift = do_shift
        self.seq_len = seq_len

    def prior_logp(self, z):
        """
        Standard multivariate normal prior
        Assume z is batched
        """
        shape = th.tensor(z.size())
        N = th.prod(shape[1:])
        _fn = lambda x: -N / 2.0 * np.log(2 * np.pi) - th.sum(x**2) / 2.0
        return th.vmap(_fn)(z)

    def check_interval(
        self,
        train_eps,
        sample_eps,
        *,
        diffusion_form="SBDM",
        sde=False,
        reverse=False,
        eval=False,
        last_step_size=0.0,
    ):
        """Return the integration interval ``(t0, t1)`` for the current path / model type.

        ``train_eps`` is used unless ``eval`` is set, in which case ``sample_eps``
        is used. With ``reverse`` the interval is mirrored to ``(1 - t0, 1 - t1)``.
        """
        t0 = 0
        t1 = 1
        eps = train_eps if not eval else sample_eps
        if type(self.path_sampler) in [path.VPCPlan]:
            t1 = 1 - eps if (not sde or last_step_size == 0) else 1 - last_step_size
        elif (type(self.path_sampler) in [path.ICPlan, path.GVPCPlan]) and (
            self.model_type != ModelType.VELOCITY or sde
        ):  # avoid numerical issue by taking a first semi-implicit step
            t0 = eps if (diffusion_form == "SBDM" and sde) or self.model_type != ModelType.VELOCITY else 0
            t1 = 1 - eps if (not sde or last_step_size == 0) else 1 - last_step_size
        if reverse:
            t0, t1 = 1 - t0, 1 - t1
        return t0, t1

    def sample(self, x1):
        """Sampling x0 & t based on shape of x1 (if needed)

        Args:
            x1 - data point; [batch, *dim], or a list/tuple of per-sample tensors

        Returns:
            ``(t, x0, x1)`` where ``x0`` is Gaussian noise shaped like ``x1`` and
            ``t`` has one entry per sample.

        Raises:
            NotImplementedError: for an unknown ``snr_type``.
        """
        if isinstance(x1, (list, tuple)):
            x0 = [th.randn_like(img_start) for img_start in x1]
        else:
            x0 = th.randn_like(x1)
        t0, t1 = self.check_interval(self.train_eps, self.sample_eps)
        if self.snr_type.startswith("uniform"):
            assert t0 == 0.0 and t1 == 1.0, "not implemented."
            # "uniform_<lo>_<hi>" restricts the sampling range.
            if "_" in self.snr_type:
                _, t0, t1 = self.snr_type.split("_")
                t0, t1 = float(t0), float(t1)
            t = th.rand((len(x1),)) * (t1 - t0) + t0
        elif self.snr_type == "lognorm":
            # Logistic-normal: sigmoid of a standard normal draw, rescaled to [t0, t1].
            u = th.normal(mean=0.0, std=1.0, size=(len(x1),))
            t = 1 / (1 + th.exp(-u)) * (t1 - t0) + t0
        else:
            raise NotImplementedError("Not implemented snr_type %s" % self.snr_type)
        if self.do_shift:
            base_shift: float = 0.5
            max_shift: float = 1.15
            mu = self.get_lin_function(y1=base_shift, y2=max_shift)(self.seq_len)
            t = self.time_shift(mu, 1.0, t)
        # Match dtype/device of the data.
        t = t.to(x1[0])
        return t, x0, x1

    def time_shift(self, mu: float, sigma: float, t: th.Tensor):
        """Apply the exponential time shift ``exp(mu) / (exp(mu) + (1/t - 1)**sigma)`` in flipped time."""
        # the following implementation was original for t=0: clean / t=1: noise
        # Since we adopt the reverse, the 1-t operations are needed
        t = 1 - t
        t = math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
        t = 1 - t
        return t

    def get_lin_function(
        self, x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
    ) -> Callable[[float], float]:
        """Return the line through ``(x1, y1)`` and ``(x2, y2)`` as a callable."""
        m = (y2 - y1) / (x2 - x1)
        b = y1 - m * x1
        return lambda x: m * x + b

    def training_losses(self, model, x1, model_kwargs=None):
        """Loss for training the score model

        Args:
            - model: backbone model; could be score, noise, or velocity
            - x1: datapoint
            - model_kwargs: additional arguments for the model; an optional
              ``"cond"`` entry is concatenated onto each ``xt`` and not forwarded

        Returns:
            dict with ``"loss"`` (differentiable), ``"task_loss"`` (detached copy)
            and ``"t"``.

        Raises:
            NotImplementedError: if the model type is not VELOCITY.
        """
        # Work on a shallow copy so that popping "cond" never mutates the caller's dict.
        model_kwargs = {} if model_kwargs is None else dict(model_kwargs)
        t, x0, x1 = self.sample(x1)
        t, xt, ut = self.path_sampler.plan(t, x0, x1)
        if "cond" in model_kwargs:
            conds = model_kwargs.pop("cond")
            xt = [th.cat([x, cond], dim=0) if cond is not None else x for x, cond in zip(xt, conds)]
        model_output = model(xt, t, **model_kwargs)
        # Unwrap model output
        if hasattr(model_output, 'sample'):
            model_output = model_output.sample
        elif isinstance(model_output, tuple):
            model_output = model_output[0]
        B = len(x0)
        terms = {}
        if self.model_type == ModelType.VELOCITY:
            if isinstance(x1, (list, tuple)):
                assert len(model_output) == len(ut) == len(x1)
                for i in range(B):
                    assert (
                        model_output[i].shape == ut[i].shape == x1[i].shape
                    ), f"{model_output[i].shape} {ut[i].shape} {x1[i].shape}"
                # One scalar MSE per sample.
                terms["task_loss"] = th.stack(
                    [((ut[i] - model_output[i]) ** 2).mean() for i in range(B)],
                    dim=0,
                )
            else:
                terms["task_loss"] = mean_flat(((model_output - ut) ** 2))
        else:
            raise NotImplementedError
        terms["loss"] = terms["task_loss"]
        terms["task_loss"] = terms["task_loss"].clone().detach()
        terms["t"] = t
        return terms

    def get_drift(self):
        """member function for obtaining the drift of the probability flow ODE"""

        def score_ode(x, t, model, **model_kwargs):
            drift_mean, drift_var = self.path_sampler.compute_drift(x, t)
            model_output = model(x, t, **model_kwargs)
            return -drift_mean + drift_var * model_output  # by change of variable

        def noise_ode(x, t, model, **model_kwargs):
            drift_mean, drift_var = self.path_sampler.compute_drift(x, t)
            sigma_t, _ = self.path_sampler.compute_sigma_t(path.expand_t_like_x(t, x))
            model_output = model(x, t, **model_kwargs)
            score = model_output / -sigma_t
            return -drift_mean + drift_var * score

        def velocity_ode(x, t, model, **model_kwargs):
            model_output = model(x, t, **model_kwargs)
            return model_output

        if self.model_type == ModelType.NOISE:
            drift_fn = noise_ode
        elif self.model_type == ModelType.SCORE:
            drift_fn = score_ode
        else:
            drift_fn = velocity_ode

        def body_fn(x, t, model, **model_kwargs):
            model_output = drift_fn(x, t, model, **model_kwargs)
            assert model_output.shape == x.shape, "Output shape from ODE solver must match input shape"
            return model_output

        return body_fn

    def get_score(
        self,
    ):
        """member function for obtaining score of
        x_t = alpha_t * x + sigma_t * eps

        Raises:
            NotImplementedError: for an unknown model type.
        """
        if self.model_type == ModelType.NOISE:
            score_fn = (
                lambda x, t, model, **kwargs: model(x, t, **kwargs)
                / -self.path_sampler.compute_sigma_t(path.expand_t_like_x(t, x))[0]
            )
        elif self.model_type == ModelType.SCORE:
            score_fn = lambda x, t, model, **kwargs: model(x, t, **kwargs)
        elif self.model_type == ModelType.VELOCITY:
            score_fn = lambda x, t, model, **kwargs: self.path_sampler.get_score_from_velocity(
                model(x, t, **kwargs), x, t
            )
        else:
            raise NotImplementedError()
        return score_fn
class Sampler:
    """Sampler class for the transport model"""

    def __init__(
        self,
        transport,
    ):
        """Constructor for a general sampler; supporting different sampling methods

        Args:
            - transport: a transport object specifying model prediction & interpolant type
        """
        self.transport = transport
        self.drift = self.transport.get_drift()
        self.score = self.transport.get_score()

    def __get_sde_diffusion_and_drift(
        self,
        *,
        diffusion_form="SBDM",
        diffusion_norm=1.0,
    ):
        """Build the SDE drift ``drift + diffusion * score`` and the diffusion function."""

        def diffusion_fn(x, t):
            diffusion = self.transport.path_sampler.compute_diffusion(x, t, form=diffusion_form, norm=diffusion_norm)
            return diffusion

        sde_drift = lambda x, t, model, **kwargs: self.drift(x, t, model, **kwargs) + diffusion_fn(x, t) * self.score(
            x, t, model, **kwargs
        )
        sde_diffusion = diffusion_fn
        return sde_drift, sde_diffusion

    def __get_last_step(
        self,
        sde_drift,
        *,
        last_step,
        last_step_size,
    ):
        """Get the last step function of the SDE solver

        Raises:
            NotImplementedError: if ``last_step`` is not None, "Mean", "Tweedie" or "Euler".
        """
        if last_step is None:
            last_step_fn = lambda x, t, model, **model_kwargs: x
        elif last_step == "Mean":
            last_step_fn = (
                lambda x, t, model, **model_kwargs: x + sde_drift(x, t, model, **model_kwargs) * last_step_size
            )
        elif last_step == "Tweedie":
            alpha = self.transport.path_sampler.compute_alpha_t  # simple aliasing; the original name was too long
            sigma = self.transport.path_sampler.compute_sigma_t
            last_step_fn = lambda x, t, model, **model_kwargs: x / alpha(t)[0][0] + (sigma(t)[0][0] ** 2) / alpha(t)[0][
                0
            ] * self.score(x, t, model, **model_kwargs)
        elif last_step == "Euler":
            last_step_fn = (
                lambda x, t, model, **model_kwargs: x + self.drift(x, t, model, **model_kwargs) * last_step_size
            )
        else:
            raise NotImplementedError()
        return last_step_fn

    def sample_sde(
        self,
        *,
        sampling_method="Euler",
        diffusion_form="SBDM",
        diffusion_norm=1.0,
        last_step="Mean",
        last_step_size=0.04,
        num_steps=250,
    ):
        """returns a sampling function with given SDE settings

        Args:
            - sampling_method: type of sampler used in solving the SDE; default to be Euler-Maruyama
            - diffusion_form: function form of diffusion coefficient; default to be matching SBDM
            - diffusion_norm: function magnitude of diffusion coefficient; default to 1
            - last_step: type of the last step; defaults to "Mean" (None means identity)
            - last_step_size: size of the last step; default 0.04
            - num_steps: total integration step of SDE
        """
        if last_step is None:
            last_step_size = 0.0
        sde_drift, sde_diffusion = self.__get_sde_diffusion_and_drift(
            diffusion_form=diffusion_form,
            diffusion_norm=diffusion_norm,
        )
        t0, t1 = self.transport.check_interval(
            self.transport.train_eps,
            self.transport.sample_eps,
            diffusion_form=diffusion_form,
            sde=True,
            eval=True,
            reverse=False,
            last_step_size=last_step_size,
        )
        _sde = sde(
            sde_drift,
            sde_diffusion,
            t0=t0,
            t1=t1,
            num_steps=num_steps,
            sampler_type=sampling_method,
        )
        last_step_fn = self.__get_last_step(sde_drift, last_step=last_step, last_step_size=last_step_size)

        def _sample(init, model, **model_kwargs):
            xs = _sde.sample(init, model, **model_kwargs)
            ts = th.ones(init.size(0), device=init.device) * t1
            x = last_step_fn(xs[-1], ts, model, **model_kwargs)
            xs.append(x)
            assert len(xs) == num_steps, "Samples does not match the number of steps"
            return xs

        return _sample

    def sample_dpm(
        self,
        model,
        model_kwargs=None,
    ):
        """Return the ``sample`` method of a DPM-Solver++ built around ``model``.

        Args:
            - model: velocity model called as ``model(x, 1 - t_continuous, **model_kwargs)``
            - model_kwargs: extra keyword arguments for the model; None means no extras
        """
        # ``**None`` would raise a TypeError at the first model call.
        if model_kwargs is None:
            model_kwargs = {}
        noise_schedule = NoiseScheduleFlow(schedule="discrete_flow")

        def noise_pred_fn(x, t_continuous):
            output = model(x, 1 - t_continuous, **model_kwargs)
            # Models may return a tuple / output container; take its first entry.
            if not th.is_tensor(output):
                output = output[0]
            sigma_t = noise_schedule.marginal_std(t_continuous)
            return x - (1 - expand_dims(sigma_t, x.dim()).to(x)) * output

        return DPM_Solver(noise_pred_fn, noise_schedule, algorithm_type="dpmsolver++").sample

    def sample_ode(
        self,
        *,
        sampling_method="dopri5",
        num_steps=50,
        atol=1e-6,
        rtol=1e-3,
        reverse=False,
        do_shift=False,
        time_shifting_factor=None,
        stochast_ratio=0.0,  # 0.0 = pure ODE, 1.0 = fully re-noised at every step
    ):
        """Return a sampling function for the probability-flow ODE.

        Args:
            - sampling_method: ODE solver type (used only when ``stochast_ratio == 0``)
            - num_steps: number of integration steps
            - atol / rtol: solver tolerances (used only when ``stochast_ratio == 0``)
            - reverse: integrate over the mirrored interval
            - do_shift / time_shifting_factor: forwarded to the ODE solver; not
              used by the re-noising branch
            - stochast_ratio: in [0, 1]; fraction of the noise component that is
              replaced by fresh Gaussian noise at each step

        Raises:
            ValueError: if ``stochast_ratio`` is outside [0, 1].
        """
        # Outside [0, 1] the square roots below would become complex numbers.
        if not 0.0 <= stochast_ratio <= 1.0:
            raise ValueError(f"stochast_ratio must be in [0, 1], got {stochast_ratio}")

        t0, t1 = self.transport.check_interval(
            self.transport.train_eps,
            self.transport.sample_eps,
            sde=False,
            eval=True,
            reverse=reverse,
            last_step_size=0.0,
        )

        if stochast_ratio == 0.0:
            # Original deterministic path: delegate to the ODE integrator.
            drift = lambda x, t, model, **kwargs: self.drift(x, t, model, **kwargs)
            _ode = ode(
                drift=drift,
                t0=t0,
                t1=t1,
                sampler_type=sampling_method,
                num_steps=num_steps,
                atol=atol,
                rtol=rtol,
                do_shift=do_shift,
                time_shifting_factor=time_shifting_factor,
            )
            return _ode.sample

        # DDPM-style sampling with re-noising at every step.
        path_sampler = self.transport.path_sampler

        def _sample(init, model, **model_kwargs):
            # t0 -> t1: noise (t=0) -> data (t=1)
            t_steps = th.linspace(t0, t1, num_steps + 1, dtype=th.float64).to(init)
            x_cur = init.to(th.float64)
            for t_cur, t_next in zip(t_steps[:-1], t_steps[1:]):
                t_batch = th.ones(x_cur.size(0), device=x_cur.device, dtype=x_cur.dtype) * t_cur
                # 1. The model predicts the velocity.
                v = model(x_cur, t_batch, **model_kwargs)
                # 2. Recover x1_hat and x0_hat directly from the flow-matching
                #    relations, avoiding the singularity of dividing by alpha_t:
                #    solve x_t = alpha_t*x1 + sigma_t*x0 and v = d_alpha_t*x1 + d_sigma_t*x0.
                t_exp = expand_dims(t_batch, x_cur.dim())
                alpha_t, d_alpha_t = path_sampler.compute_alpha_t(t_exp)
                sigma_t, d_sigma_t = path_sampler.compute_sigma_t(t_exp)
                denom = sigma_t * d_alpha_t - d_sigma_t * alpha_t  # =1 for ICPlan
                x1_hat = (sigma_t * v - d_sigma_t * x_cur) / denom
                x0_hat = (d_alpha_t * x_cur - alpha_t * v) / denom
                # 3. Re-noise at t_next, mixing the recovered noise with fresh noise.
                t_next_batch = th.ones_like(t_batch) * t_next
                t_next_exp = expand_dims(t_next_batch, x_cur.dim())
                alpha_next, _ = path_sampler.compute_alpha_t(t_next_exp)
                sigma_next, _ = path_sampler.compute_sigma_t(t_next_exp)
                noi = th.randn_like(x_cur)
                x_cur = alpha_next * x1_hat + sigma_next * (
                    x0_hat * ((1 - stochast_ratio) ** 0.5)
                    + noi * (stochast_ratio ** 0.5)
                )
            return [x_cur]

        return _sample

    def sample_ode_likelihood(
        self,
        *,
        sampling_method="dopri5",
        num_steps=50,
        atol=1e-6,
        rtol=1e-3,
    ):
        """returns a sampling function for calculating likelihood with given ODE settings

        Args:
            - sampling_method: type of sampler used in solving the ODE; default to be Dopri5
            - num_steps:
                - fixed solver (Euler, Heun): the actual number of integration steps performed
                - adaptive solver (Dopri5): the number of datapoints saved during integration; produced by interpolation
            - atol: absolute error tolerance for the solver
            - rtol: relative error tolerance for the solver
        """

        def _likelihood_drift(x, t, model, **model_kwargs):
            x, _ = x
            # Random +/-1 probe vector for the divergence estimate.
            eps = th.randint(2, x.size(), dtype=th.float, device=x.device) * 2 - 1
            t = th.ones_like(t) * (1 - t)
            with th.enable_grad():
                x.requires_grad = True
                grad = th.autograd.grad(th.sum(self.drift(x, t, model, **model_kwargs) * eps), x)[0]
                logp_grad = th.sum(grad * eps, dim=tuple(range(1, len(x.size()))))
                drift = self.drift(x, t, model, **model_kwargs)
            return (-drift, logp_grad)

        t0, t1 = self.transport.check_interval(
            self.transport.train_eps,
            self.transport.sample_eps,
            sde=False,
            eval=True,
            reverse=False,
            last_step_size=0.0,
        )
        _ode = ode(
            drift=_likelihood_drift,
            t0=t0,
            t1=t1,
            sampler_type=sampling_method,
            num_steps=num_steps,
            atol=atol,
            rtol=rtol,
        )

        def _sample_fn(x, model, **model_kwargs):
            init_logp = th.zeros(x.size(0)).to(x)
            state = (x, init_logp)
            drift, delta_logp = _ode.sample(state, model, **model_kwargs)
            drift, delta_logp = drift[-1], delta_logp[-1]
            prior_logp = self.transport.prior_logp(drift)
            logp = prior_logp - delta_logp
            return logp, drift

        return _sample_fn
import torch as th
import math
class EasyDict:
    """Expose the entries of a mapping both as attributes and via ``obj[key]``."""

    def __init__(self, sub_dict):
        # Copy every entry of the mapping onto the instance as an attribute.
        for name in sub_dict:
            setattr(self, name, sub_dict[name])

    def __getitem__(self, key):
        # Subscript access is routed through attribute lookup.
        return getattr(self, key)
def mean_flat(x):
    """
    Average a tensor over every dimension except the first (batch) one.
    """
    non_batch_axes = list(range(1, x.dim()))
    return x.mean(dim=non_batch_axes)
def log_state(state):
    """Render a state mapping as ``key: value`` lines, sorted by key.

    Values whose string form looks like a default object repr are shown as
    ``[ClassName]`` instead.
    """

    def _render(key, value):
        text = str(value)
        # Check if the value is an instance of a class
        looks_like_object = "<object" in text or "object at" in text
        if looks_like_object:
            return f"{key}: [{value.__class__.__name__}]"
        return f"{key}: {value}"

    return "\n".join(_render(key, state[key]) for key in sorted(state))
def time_shift(mu: float, sigma: float, t: th.Tensor):
    """Shift time values with ``exp(mu) / (exp(mu) + (1/t - 1)**sigma)`` applied in flipped time."""
    # The formula was originally written for t=0: clean / t=1: noise.
    # This code base uses the opposite convention, hence the flips before and after.
    scale = math.exp(mu)
    flipped = 1 - t
    shifted = scale / (scale + (1 / flipped - 1) ** sigma)
    return 1 - shifted
def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15):
    """Return the straight line through ``(x1, y1)`` and ``(x2, y2)`` as a callable."""
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1

    def line(x):
        return slope * x + intercept

    return line
def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` until it has `dims` dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: an `int`, the total number of dimensions wanted.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    new_axes = (None,) * (dims - 1)
    index = (Ellipsis,) + new_axes
    return v[index]
\ No newline at end of file
"""Utility functions for image preprocessing."""
import random
from PIL import Image
def center_crop(pil_image, crop_size):
    """Crop a ``crop_size`` window around the image centre and resize it to ``crop_size``.

    ``crop_size`` is ``(width, height)``. The crop offsets are clamped at 0.
    """
    target_w, target_h = crop_size
    src_w, src_h = pil_image.size
    x0 = max(0, (src_w - target_w) // 2)
    y0 = max(0, (src_h - target_h) // 2)
    box = (x0, y0, x0 + target_w, y0 + target_h)
    cropped = pil_image.crop(box)
    return cropped.resize((target_w, target_h), Image.LANCZOS)
def var_center_crop(pil_image, crop_size_list, random_top_k=1):
    """Center-crop to one of the best-matching sizes in ``crop_size_list``.

    Each candidate is scored by how closely its width and height scale factors
    relative to the image agree (1.0 means identical aspect ratio). One of the
    ``random_top_k`` highest-scoring candidates is picked at random.
    """
    w, h = pil_image.size
    scores = []
    for cw, ch in crop_size_list:
        scale_w, scale_h = cw / w, ch / h
        scores.append(min(scale_w, scale_h) / max(scale_w, scale_h))
    ranked = sorted(zip(scores, crop_size_list), reverse=True)
    _, chosen_size = random.choice(ranked[:random_top_k])
    return center_crop(pil_image, chosen_size)
def generate_crop_size_list(num_patches, patch_size, max_ratio=4.0):
    """List ``(width, height)`` crop sizes whose patch grid fits in ``num_patches``.

    Walks grid shapes from the widest ``(num_patches, 1)`` towards the tallest,
    keeping those whose aspect ratio does not exceed ``max_ratio``. Sizes are
    returned in pixels (patch counts multiplied by ``patch_size``).
    """
    assert max_ratio >= 1.0
    sizes = []
    cols, rows = num_patches, 1
    while cols > 0:
        longer, shorter = max(cols, rows), min(cols, rows)
        if longer / shorter <= max_ratio:
            sizes.append((cols * patch_size, rows * patch_size))
        # Grow the height while the grid still fits; otherwise shrink the width.
        can_add_row = (rows + 1) * cols <= num_patches
        if can_add_row:
            rows += 1
        else:
            cols -= 1
    return sizes
from .image_tokenizer import ImageTokenizer
This diff is collapsed.
"""
LLaDA-2.0-Uni — Image Editing
Usage:
python image_edit.py --model_path /path/to/LLaDA-2.0-Uni --image input.jpg --instruction "Change the background to a beach."
python image_edit.py --model_path /path/to/LLaDA-2.0-Uni --image_token input.pt --instruction "Make it a watercolor painting."
"""
import os, sys, gc, argparse, torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from decoder import decode_vq_tokens
def parse_args():
    """Parse the command-line options of the image editing script."""
    parser = argparse.ArgumentParser(description="LLaDA-2.0-Uni Image Editing")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Root model dir containing LLM weights, image_tokenizer/, decoder/, vae/",
    )
    parser.add_argument("--image", type=str, default=None)
    parser.add_argument("--image_token", type=str, default=None)
    parser.add_argument("--instruction", type=str, required=True)
    # Remaining options share the same shape: (flag, type, default).
    tunables = (
        ("--steps", int, 8),
        ("--block_length", int, 32),
        ("--cfg_text_scale", float, 4.0),
        ("--cfg_image_scale", float, 0.0),
        ("--decoder_steps", int, 50),
        ("--resolution_multiplier", int, 2),
        ("--output", str, "edited.png"),
        ("--seed", int, 42),
    )
    for flag, kind, default in tunables:
        parser.add_argument(flag, type=kind, default=default)
    return parser.parse_args()
def _get_image_token_offset(model_path):
"""Read image_token_offset from model config."""
import json
with open(os.path.join(model_path, "config.json")) as f:
return json.load(f).get("image_token_offset", 157184)
def encode_image_from_pt(pt_path, offset):
    """Load pre-tokenized image data and return ``(token_ids, grid_h, grid_w)``.

    The token ids are shifted by ``offset``; the grid size is the stored
    ``processed_size`` (width, height) integer-divided by 16.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted files.
    payload = torch.load(pt_path, map_location="cpu", weights_only=False)
    shifted_ids = payload["semantic_token_ids"] + offset
    token_ids = shifted_ids.tolist()
    width, height = payload["metadata"]["processed_size"]
    return token_ids, height // 16, width // 16
def encode_image_from_pil(image_path, model_path, device, offset):
    """Tokenize an image file and return ``(token_ids, grid_h, grid_w)``.

    The image is converted to RGB, center-cropped to one of the generated
    crop sizes, encoded with the image tokenizer, and every token id is
    shifted by ``offset``.
    """
    from encoder.image_tokenizer import ImageTokenizer
    from decoder.utils import generate_crop_size_list, var_center_crop

    image_tokenizer = ImageTokenizer(
        model_path=model_path,
        device=device,
        dtype=torch.bfloat16,
    )
    candidate_sizes = generate_crop_size_list((512 // 32) ** 2, 32)
    rgb_image = Image.open(image_path).convert("RGB")
    cropped = var_center_crop(rgb_image, crop_size_list=candidate_sizes)
    info = image_tokenizer.encode_with_info(cropped)
    _, h, w = info["grid_thw"]
    token_ids = [token + offset for token in info["token_ids"]]
    # Drop the tokenizer and release cached GPU memory before the main model loads.
    del image_tokenizer
    torch.cuda.empty_cache()
    return token_ids, h, w
def main():
    """Run the editing pipeline: encode the source image, edit its tokens, decode and save."""
    args = parse_args()
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Encode source image (a pre-tokenized file takes precedence over a raw image).
    offset = _get_image_token_offset(args.model_path)
    if args.image_token:
        print(f"Loading pre-tokenized image: {args.image_token}")
        encoded = encode_image_from_pt(args.image_token, offset)
    elif args.image:
        print(f"Encoding image: {args.image}")
        encoded = encode_image_from_pil(args.image, args.model_path, device, offset)
    else:
        raise ValueError("Provide --image or --image_token")
    image_tokens, image_h, image_w = encoded
    print(f"Image grid: {image_h}x{image_w}, instruction: {args.instruction}")

    # Phase 1: generate edited VQ tokens
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, device_map={"": device}, trust_remote_code=True
    )
    model = model.to(torch.bfloat16).eval()
    model.tokenizer = tokenizer
    edit_options = dict(
        steps=args.steps,
        block_length=args.block_length,
        cfg_text_scale=args.cfg_text_scale,
        cfg_image_scale=args.cfg_image_scale,
    )
    result = model.edit_image(image_tokens, image_h, image_w, args.instruction, **edit_options)
    # Free the language model before the decoder is loaded.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Model unloaded.\n")

    # Phase 2: decode to image
    print("Decoding edited image...")
    img = decode_vq_tokens(
        result["token_ids"],
        result["h"],
        result["w"],
        args.model_path,
        device,
        resolution_multiplier=args.resolution_multiplier,
        num_steps=args.decoder_steps,
    )
    img.save(args.output)
    print(f"\n✅ Saved: {args.output}")


if __name__ == "__main__":
    main()
"""
LLaDA-2.0-Uni — Image Understanding (Multimodal Understanding)
Usage:
python mmu_understand.py --model_path /path/to/LLaDA-2.0-Uni --image photo.jpg
python mmu_understand.py --model_path /path/to/LLaDA-2.0-Uni --image_token photo.pt
python mmu_understand.py --model_path /path/to/LLaDA-2.0-Uni --image photo.jpg --question "Describe this image."
"""
import os, sys, argparse, torch
from transformers import AutoModelForCausalLM, AutoTokenizer
def parse_args():
    """Parse the command-line options of the image understanding script."""
    parser = argparse.ArgumentParser(description="LLaDA-2.0-Uni Image Understanding")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Root model dir containing LLM weights and image_tokenizer/",
    )
    # Optional string options: (flag, default, help text).
    text_options = (
        ("--image", None, "Path to input image (jpg/png)"),
        ("--image_token", None, "Path to pre-tokenized .pt file"),
        ("--question", "", "Optional question/prefix"),
    )
    for flag, default, help_text in text_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    # Integer generation settings: (flag, default).
    for flag, default in (("--steps", 32), ("--block_length", 32), ("--gen_length", 2048)):
        parser.add_argument(flag, type=int, default=default)
    return parser.parse_args()
def _get_image_token_offset(model_path):
"""Read image_token_offset from model config."""
import json
with open(os.path.join(model_path, "config.json")) as f:
return json.load(f).get("image_token_offset", 157184)
def encode_image_from_pt(pt_path, offset):
    """Load pre-tokenized image data and return ``(token_ids, grid_h, grid_w)``.

    The token ids are shifted by ``offset``; the grid size is the stored
    ``processed_size`` (width, height) integer-divided by 16.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load trusted files.
    payload = torch.load(pt_path, map_location="cpu", weights_only=False)
    shifted_ids = payload["semantic_token_ids"] + offset
    token_ids = shifted_ids.tolist()
    width, height = payload["metadata"]["processed_size"]
    return token_ids, height // 16, width // 16
def encode_image_from_pil(image_path, model_path, device, offset):
    """Tokenize an image file and return ``(token_ids, grid_h, grid_w)``.

    The image is resized with ``smart_resize_images``, encoded with the image
    tokenizer, and every token id is shifted by ``offset``.
    """
    # Make the repository root importable for the encoder / decoder packages.
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.insert(0, repo_root)
    from encoder.image_tokenizer import ImageTokenizer
    from decoder.smart_img_process import smart_resize_images

    image_tokenizer = ImageTokenizer(
        model_path=model_path,
        device=device,
        dtype=torch.bfloat16,
    )
    resized = smart_resize_images([image_path])
    info = image_tokenizer.encode_with_info(resized[0])
    _, h, w = info["grid_thw"]
    token_ids = [token + offset for token in info["token_ids"]]
    # Drop the tokenizer and release cached GPU memory before the main model loads.
    del image_tokenizer
    torch.cuda.empty_cache()
    return token_ids, h, w
def main():
    """Run the understanding pipeline: encode the image, load the model, print its response."""
    args = parse_args()
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Encode image (a pre-tokenized file takes precedence over a raw image).
    offset = _get_image_token_offset(args.model_path)
    if args.image_token:
        print(f"Loading pre-tokenized image: {args.image_token}")
        encoded = encode_image_from_pt(args.image_token, offset)
    elif args.image:
        print(f"Encoding image: {args.image}")
        encoded = encode_image_from_pil(args.image, args.model_path, device, offset)
    else:
        raise ValueError("Provide --image or --image_token")
    image_tokens, image_h, image_w = encoded
    print(f"Image grid: {image_h}x{image_w}, tokens: {len(image_tokens)}")

    # Load model and use high-level API
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_path, device_map=device, trust_remote_code=True
    )
    model = model.to(torch.bfloat16).eval()
    model.tokenizer = tokenizer

    print("Generating...")
    generation_options = dict(
        question=args.question,
        steps=args.steps,
        block_length=args.block_length,
        gen_length=args.gen_length,
    )
    response = model.understand_image(image_tokens, image_h, image_w, **generation_options)

    print(f"\n{'='*60}")
    print(f"Question: {args.question or '(none)'}")
    print(f"{'='*60}")
    print(f"Response:\n{response}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment