Commit 9c0053b7 authored by chenzk

v1.0
# Model code
modelCode=612
# Model name
modelName=itransformer_pytorch
# Model description
modelDescription=iTransformer makes efficient use of long-range temporal features and can forecast multiple metrics simultaneously.
# Application scenarios
appScenario=inference,training,finance,operations,e-commerce,manufacturing,energy,healthcare
# Framework type
frameType=pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from layers.SelfAttention_Family import FlashAttention, AttentionLayer, FullAttention
from layers.Embed import DataEmbedding
import numpy as np
class Model(nn.Module):
"""
Transformer encoder-decoder baseline with FlashAttention replacing the vanilla O(L^2) self-attention
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Embedding
self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FlashAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
# Decoder
self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.decoder = Decoder(
[
DecoderLayer(
AttentionLayer(
FullAttention(True, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
AttentionLayer(
FullAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation,
)
for l in range(configs.d_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model),
projection=nn.Linear(configs.d_model, configs.c_out, bias=True)
)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.dec_embedding(x_dec, x_mark_dec)
dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None)
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from layers.SelfAttention_Family import FullAttention, AttentionLayer, FlowAttention
from layers.Embed import DataEmbedding
import numpy as np
class Model(nn.Module):
"""
Transformer encoder-decoder baseline with linear-complexity Flow-Attention in the encoder
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
if configs.channel_independence:
self.enc_in = 1
self.dec_in = 1
self.c_out = 1
else:
self.enc_in = configs.enc_in
self.dec_in = configs.dec_in
self.c_out = configs.c_out
# Embedding
self.enc_embedding = DataEmbedding(self.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FlowAttention(attention_dropout=configs.dropout), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
# Decoder
self.dec_embedding = DataEmbedding(self.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.decoder = Decoder(
[
DecoderLayer(
AttentionLayer(
FullAttention(True, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
AttentionLayer(
FullAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation,
)
for l in range(configs.d_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model),
projection=nn.Linear(configs.d_model, configs.c_out, bias=True)
)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.dec_embedding(x_dec, x_mark_dec)
dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None)
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from layers.SelfAttention_Family import ProbAttention, AttentionLayer
from layers.Embed import DataEmbedding
class Model(nn.Module):
"""
Informer with ProbSparse attention in O(L log L) complexity
Paper link: https://ojs.aaai.org/index.php/AAAI/article/view/17325/17132
"""
def __init__(self, configs):
super(Model, self).__init__()
self.pred_len = configs.pred_len
self.label_len = configs.label_len
if configs.channel_independence:
self.enc_in = 1
self.dec_in = 1
self.c_out = 1
else:
self.enc_in = configs.enc_in
self.dec_in = configs.dec_in
self.c_out = configs.c_out
# Embedding
self.enc_embedding = DataEmbedding(self.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding(self.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
ProbAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
[
ConvLayer(
configs.d_model
) for l in range(configs.e_layers - 1)
] if configs.distil else None,
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
# Decoder
self.decoder = Decoder(
[
DecoderLayer(
AttentionLayer(
ProbAttention(True, configs.factor, attention_dropout=configs.dropout, output_attention=False),
configs.d_model, configs.n_heads),
AttentionLayer(
ProbAttention(False, configs.factor, attention_dropout=configs.dropout, output_attention=False),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation,
)
for l in range(configs.d_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model),
projection=nn.Linear(configs.d_model, configs.c_out, bias=True)
)
def long_forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
enc_out = self.enc_embedding(x_enc, x_mark_enc)
dec_out = self.dec_embedding(x_dec, x_mark_dec)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None)
return dec_out # [B, L, D]
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.long_forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import ReformerLayer
from layers.Embed import DataEmbedding
class Model(nn.Module):
"""
Reformer with O(L log L) complexity
Paper link: https://openreview.net/forum?id=rkgNKkHtvB
"""
def __init__(self, configs, bucket_size=4, n_hashes=4):
"""
bucket_size: int, LSH bucket size
n_hashes: int, number of LSH hash rounds
"""
super(Model, self).__init__()
self.pred_len = configs.pred_len
self.seq_len = configs.seq_len
if configs.channel_independence:
self.enc_in = 1
self.dec_in = 1
self.c_out = 1
else:
self.enc_in = configs.enc_in
self.dec_in = configs.dec_in
self.c_out = configs.c_out
self.enc_embedding = DataEmbedding(self.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
ReformerLayer(None, configs.d_model, configs.n_heads,
bucket_size=bucket_size, n_hashes=n_hashes),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projection = nn.Linear(
configs.d_model, configs.c_out, bias=True)
def long_forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# add placeholder: extend the encoder input with the prediction-horizon part of the decoder input
x_enc = torch.cat([x_enc, x_dec[:, -self.pred_len:, :]], dim=1)
if x_mark_enc is not None:
x_mark_enc = torch.cat(
[x_mark_enc, x_mark_dec[:, -self.pred_len:, :]], dim=1)
enc_out = self.enc_embedding(x_enc, x_mark_enc) # [B,T,C]
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.projection(enc_out)
return dec_out # [B, L, D]
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.long_forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
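Because the Reformer baseline above is encoder-only, `long_forecast` first appends the prediction-horizon slice of the decoder input to the encoder input, so the encoder sees `seq_len + pred_len` steps and the forecast is read off the tail. A toy shape check with made-up dimensions (independent of the repository code):

```
import torch

B, seq_len, label_len, pred_len, N = 4, 96, 48, 96, 7   # illustrative sizes
x_enc = torch.randn(B, seq_len, N)
x_dec = torch.zeros(B, label_len + pred_len, N)          # decoder input with zero placeholder

x_enc_ext = torch.cat([x_enc, x_dec[:, -pred_len:, :]], dim=1)
print(x_enc_ext.shape)                                   # torch.Size([4, 192, 7]) = [B, seq_len + pred_len, N]
```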
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer
from layers.SelfAttention_Family import FullAttention, AttentionLayer
from layers.Embed import DataEmbedding
import numpy as np
class Model(nn.Module):
"""
Vanilla Transformer
with O(L^2) complexity
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
if configs.channel_independence:
self.enc_in = 1
self.dec_in = 1
self.c_out = 1
else:
self.enc_in = configs.enc_in
self.dec_in = configs.dec_in
self.c_out = configs.c_out
# Embedding
self.enc_embedding = DataEmbedding(self.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FullAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
# Decoder
self.dec_embedding = DataEmbedding(self.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.decoder = Decoder(
[
DecoderLayer(
AttentionLayer(
FullAttention(True, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
AttentionLayer(
FullAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation,
)
for l in range(configs.d_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model),
projection=nn.Linear(configs.d_model, configs.c_out, bias=True)
)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.dec_embedding(x_dec, x_mark_dec)
dec_out = self.decoder(dec_out, enc_out, x_mask=None, cross_mask=None)
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
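The encoder-decoder baselines in this commit (the FlashAttention, Flow-Attention, ProbSparse and vanilla variants above) share the same constructor and forward contract. Below is a minimal usage sketch for the vanilla Transformer above; it assumes the repository's `layers` package is importable, and the `SimpleNamespace` config with these illustrative values is only a stand-in for the argparse namespace built in `run.py`, not part of this commit.

```
# Minimal usage sketch (assumption: the `layers` package used above is on the
# import path; the config fields/values below are illustrative placeholders).
from types import SimpleNamespace
import torch

configs = SimpleNamespace(
    seq_len=96, label_len=48, pred_len=96,
    enc_in=7, dec_in=7, c_out=7, channel_independence=False,
    d_model=512, n_heads=8, e_layers=2, d_layers=1, d_ff=2048,
    factor=1, dropout=0.1, embed='timeF', freq='h',
    activation='gelu', output_attention=False,
)
model = Model(configs)

B = 4                                                                          # batch size
x_enc = torch.randn(B, configs.seq_len, configs.enc_in)                        # [B, seq_len, N]
x_mark_enc = torch.randn(B, configs.seq_len, 4)                                # 4 time features for freq='h'
x_dec = torch.zeros(B, configs.label_len + configs.pred_len, configs.dec_in)   # label part + placeholder
x_mark_dec = torch.randn(B, configs.label_len + configs.pred_len, 4)
out = model(x_enc, x_mark_enc, x_dec, x_mark_dec)                              # [B, pred_len, c_out]
```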
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import FlashAttention, AttentionLayer
from layers.Embed import DataEmbedding_inverted
import numpy as np
class Model(nn.Module):
"""
Inverted (iTransformer-style) encoder-only forecaster with FlashAttention over variate tokens
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Embedding
self.enc_embedding = DataEmbedding_inverted(configs.seq_len, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder-only architecture
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FlashAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projector = nn.Linear(configs.d_model, configs.pred_len, bias=True)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Normalization from Non-stationary Transformer
means = x_enc.mean(1, keepdim=True).detach()
x_enc = x_enc - means
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_enc /= stdev
_, _, N = x_enc.shape
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]
# De-Normalization from Non-stationary Transformer
dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
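The inverted variants above and below normalize each series over the look-back window before embedding and undo it on the outputs (the trick borrowed from the Non-stationary Transformer). The following is a self-contained sketch of that normalize/de-normalize pair on a dummy tensor; the dimensions are illustrative only:

```
import torch

B, L, S, N = 4, 96, 96, 7                      # batch, look-back length, horizon, variates
x_enc = torch.randn(B, L, N)

# Per-series statistics over the look-back window.
means = x_enc.mean(1, keepdim=True).detach()                                       # [B, 1, N]
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)   # [B, 1, N]
x_norm = (x_enc - means) / stdev

dec_out = torch.randn(B, S, N)                 # stand-in for the model output on normalized inputs
# De-normalization: broadcast the look-back statistics over the forecast horizon.
dec_out = dec_out * stdev[:, 0, :].unsqueeze(1).repeat(1, S, 1)
dec_out = dec_out + means[:, 0, :].unsqueeze(1).repeat(1, S, 1)
print(dec_out.shape)                           # torch.Size([4, 96, 7])
```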
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import FlowAttention, AttentionLayer
from layers.Embed import DataEmbedding_inverted
import numpy as np
class Model(nn.Module):
"""
Inverted (iTransformer-style) encoder-only forecaster with linear-complexity Flow-Attention over variate tokens
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Embedding
self.enc_embedding = DataEmbedding_inverted(configs.seq_len, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder-only architecture
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FlowAttention(attention_dropout=configs.dropout), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projector = nn.Linear(configs.d_model, configs.pred_len, bias=True)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Normalization from Non-stationary Transformer
means = x_enc.mean(1, keepdim=True).detach()
x_enc = x_enc - means
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_enc /= stdev
_, _, N = x_enc.shape
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]
# De-Normalization from Non-stationary Transformer
dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import ProbAttention, AttentionLayer
from layers.Embed import DataEmbedding_inverted
import numpy as np
class Model(nn.Module):
"""
Inverted (iTransformer-style) encoder-only forecaster with ProbSparse attention over variate tokens
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Embedding
self.enc_embedding = DataEmbedding_inverted(configs.seq_len, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder-only architecture
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
ProbAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projector = nn.Linear(configs.d_model, configs.pred_len, bias=True)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Normalization from Non-stationary Transformer
means = x_enc.mean(1, keepdim=True).detach()
x_enc = x_enc - means
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_enc /= stdev
_, _, N = x_enc.shape
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]
# De-Normalization from Non-stationary Transformer
dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import ReformerLayer
from layers.Embed import DataEmbedding_inverted
import numpy as np
class Model(nn.Module):
"""
Inverted (iTransformer-style) encoder-only forecaster with Reformer LSH attention over variate tokens
Paper link: https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Embedding
self.enc_embedding = DataEmbedding_inverted(configs.seq_len, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder-only architecture
self.encoder = Encoder(
[
EncoderLayer(
ReformerLayer(None, configs.d_model, configs.n_heads,
bucket_size=4, n_hashes=4),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projector = nn.Linear(configs.d_model, configs.pred_len, bias=True)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
# Normalization from Non-stationary Transformer
means = x_enc.mean(1, keepdim=True).detach()
x_enc = x_enc - means
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_enc /= stdev
_, _, N = x_enc.shape
# Embedding
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=None)
dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N]
# De-Normalization from Non-stationary Transformer
dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Transformer_EncDec import Encoder, EncoderLayer
from layers.SelfAttention_Family import FullAttention, AttentionLayer
from layers.Embed import DataEmbedding_inverted
import numpy as np
class Model(nn.Module):
"""
Paper link: https://arxiv.org/abs/2310.06625
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
self.use_norm = configs.use_norm
# Embedding
self.enc_embedding = DataEmbedding_inverted(configs.seq_len, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.class_strategy = configs.class_strategy
# Encoder-only architecture
self.encoder = Encoder(
[
EncoderLayer(
AttentionLayer(
FullAttention(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention), configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=torch.nn.LayerNorm(configs.d_model)
)
self.projector = nn.Linear(configs.d_model, configs.pred_len, bias=True)
def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
if self.use_norm:
# Normalization from Non-stationary Transformer
means = x_enc.mean(1, keepdim=True).detach()
x_enc = x_enc - means
stdev = torch.sqrt(torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
x_enc /= stdev
_, _, N = x_enc.shape # B L N
# B: batch_size; E: d_model;
# L: seq_len; S: pred_len;
# N: number of variates (tokens); can also include covariates
# Embedding
# B L N -> B N E (B L N -> B L E in the vanilla Transformer)
enc_out = self.enc_embedding(x_enc, x_mark_enc) # covariates (e.g. timestamp) can also be embedded as tokens
# B N E -> B N E (B L E -> B L E in the vanilla Transformer)
# the dimensions of the embedded time series have been inverted, then processed by native attention, layernorm and FFN modules
enc_out, attns = self.encoder(enc_out, attn_mask=None)
# B N E -> B N S -> B S N
dec_out = self.projector(enc_out).permute(0, 2, 1)[:, :, :N] # filter the covariates
if self.use_norm:
# De-Normalization from Non-stationary Transformer
dec_out = dec_out * (stdev[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
dec_out = dec_out + (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
return dec_out
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
return dec_out[:, -self.pred_len:, :] # [B, L, D]
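The shape comments in `forecast` above are the heart of the inverted design: each variate's whole look-back series becomes a single token. A minimal sketch of that inversion using plain `nn.Linear` layers as stand-ins for `DataEmbedding_inverted` and the projector (illustrative only, not the repository modules):

```
import torch
import torch.nn as nn

B, L, S, N, E = 4, 96, 96, 7, 512                # batch, seq_len, pred_len, variates, d_model
x_enc = torch.randn(B, L, N)                     # [B, L, N]

embed = nn.Linear(L, E)                          # stand-in for DataEmbedding_inverted
proj = nn.Linear(E, S)                           # stand-in for self.projector

tokens = embed(x_enc.permute(0, 2, 1))           # [B, L, N] -> [B, N, E]: one token per variate
# ... attention / layernorm / FFN over the N variate tokens would run here ...
dec_out = proj(tokens).permute(0, 2, 1)          # [B, N, E] -> [B, N, S] -> [B, S, N]
print(dec_out.shape)                             # torch.Size([4, 96, 7])
```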
pandas==1.5.3
scikit-learn==1.2.2
numpy==1.23.5
matplotlib==3.7.0
# torch==2.0.0
reformer-pytorch==1.4.4
import argparse
import torch
from experiments.exp_long_term_forecasting import Exp_Long_Term_Forecast
from experiments.exp_long_term_forecasting_partial import Exp_Long_Term_Forecast_Partial
import random
import numpy as np
if __name__ == '__main__':
fix_seed = 2023
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)
parser = argparse.ArgumentParser(description='iTransformer')
# basic config
parser.add_argument('--is_training', type=int, required=True, default=1, help='status')
parser.add_argument('--model_id', type=str, required=True, default='test', help='model id')
parser.add_argument('--model', type=str, required=True, default='iTransformer',
help='model name, options: [iTransformer, iInformer, iReformer, iFlowformer, iFlashformer]')
# data loader
parser.add_argument('--data', type=str, required=True, default='custom', help='dataset type')
parser.add_argument('--root_path', type=str, default='./data/electricity/', help='root path of the data file')
parser.add_argument('--data_path', type=str, default='electricity.csv', help='data csv file')
parser.add_argument('--features', type=str, default='M',
help='forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate')
parser.add_argument('--target', type=str, default='OT', help='target feature in S or MS task')
parser.add_argument('--freq', type=str, default='h',
help='freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h')
parser.add_argument('--checkpoints', type=str, default='./checkpoints/', help='location of model checkpoints')
# forecasting task
parser.add_argument('--seq_len', type=int, default=96, help='input sequence length')
parser.add_argument('--label_len', type=int, default=48, help='start token length') # no longer needed in inverted Transformers
parser.add_argument('--pred_len', type=int, default=96, help='prediction sequence length')
# model define
parser.add_argument('--enc_in', type=int, default=7, help='encoder input size')
parser.add_argument('--dec_in', type=int, default=7, help='decoder input size')
parser.add_argument('--c_out', type=int, default=7, help='output size') # applicable to an arbitrary number of variates in inverted Transformers
parser.add_argument('--d_model', type=int, default=512, help='dimension of model')
parser.add_argument('--n_heads', type=int, default=8, help='num of heads')
parser.add_argument('--e_layers', type=int, default=2, help='num of encoder layers')
parser.add_argument('--d_layers', type=int, default=1, help='num of decoder layers')
parser.add_argument('--d_ff', type=int, default=2048, help='dimension of fcn')
parser.add_argument('--moving_avg', type=int, default=25, help='window size of moving average')
parser.add_argument('--factor', type=int, default=1, help='attn factor')
parser.add_argument('--distil', action='store_false',
help='whether to use distilling in encoder, using this argument means not using distilling',
default=True)
parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
parser.add_argument('--embed', type=str, default='timeF',
help='time features encoding, options:[timeF, fixed, learned]')
parser.add_argument('--activation', type=str, default='gelu', help='activation')
parser.add_argument('--output_attention', action='store_true', help='whether to output attention in encoder')
parser.add_argument('--do_predict', action='store_true', help='whether to predict unseen future data')
# optimization
parser.add_argument('--num_workers', type=int, default=10, help='data loader num workers')
parser.add_argument('--itr', type=int, default=1, help='number of experiment repetitions')
parser.add_argument('--train_epochs', type=int, default=10, help='train epochs')
parser.add_argument('--batch_size', type=int, default=32, help='batch size of train input data')
parser.add_argument('--patience', type=int, default=3, help='early stopping patience')
parser.add_argument('--learning_rate', type=float, default=0.0001, help='optimizer learning rate')
parser.add_argument('--des', type=str, default='test', help='exp description')
parser.add_argument('--loss', type=str, default='MSE', help='loss function')
parser.add_argument('--lradj', type=str, default='type1', help='adjust learning rate')
parser.add_argument('--use_amp', action='store_true', help='use automatic mixed precision training', default=False)
# GPU
parser.add_argument('--use_gpu', type=bool, default=True, help='use gpu')
parser.add_argument('--gpu', type=int, default=0, help='gpu')
parser.add_argument('--use_multi_gpu', action='store_true', help='use multiple gpus', default=False)
parser.add_argument('--devices', type=str, default='0,1,2,3', help='device ids of multiple gpus')
# iTransformer
parser.add_argument('--exp_name', type=str, required=False, default='MTSF',
help='experiment name, options:[MTSF, partial_train]')
parser.add_argument('--channel_independence', type=bool, default=False, help='whether to use channel_independence mechanism')
parser.add_argument('--inverse', action='store_true', help='inverse output data', default=False)
parser.add_argument('--class_strategy', type=str, default='projection', help='projection/average/cls_token')
parser.add_argument('--target_root_path', type=str, default='./data/electricity/', help='root path of the data file')
parser.add_argument('--target_data_path', type=str, default='electricity.csv', help='data file')
parser.add_argument('--efficient_training', type=bool, default=False, help='whether to use efficient_training (exp_name should be partial_train)') # See Figure 8 of the paper for details
parser.add_argument('--use_norm', type=int, default=True, help='use norm and denorm')
parser.add_argument('--partial_start_index', type=int, default=0, help='the start index of variates for partial training, '
'you can select [partial_start_index, min(enc_in + partial_start_index, N)]')
args = parser.parse_args()
args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False
if args.use_gpu and args.use_multi_gpu:
args.devices = args.devices.replace(' ', '')
device_ids = args.devices.split(',')
args.device_ids = [int(id_) for id_ in device_ids]
args.gpu = args.device_ids[0]
print('Args in experiment:')
print(args)
if args.exp_name == 'partial_train':  # See Figure 8 of the paper for details
Exp = Exp_Long_Term_Forecast_Partial
else: # MTSF: multivariate time series forecasting
Exp = Exp_Long_Term_Forecast
if args.is_training:
for ii in range(args.itr):
# setting record of experiments
setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
args.model_id,
args.model,
args.data,
args.features,
args.seq_len,
args.label_len,
args.pred_len,
args.d_model,
args.n_heads,
args.e_layers,
args.d_layers,
args.d_ff,
args.factor,
args.embed,
args.distil,
args.des,
args.class_strategy, ii)
exp = Exp(args) # set experiments
print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
exp.train(setting)
print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
exp.test(setting)
if args.do_predict:
print('>>>>>>>predicting : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
exp.predict(setting, True)
torch.cuda.empty_cache()
else:
ii = 0
setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_dt{}_{}_{}'.format(
args.model_id,
args.model,
args.data,
args.features,
args.seq_len,
args.label_len,
args.pred_len,
args.d_model,
args.n_heads,
args.e_layers,
args.d_layers,
args.d_ff,
args.factor,
args.embed,
args.distil,
args.des,
args.class_strategy, ii)
exp = Exp(args) # set experiments
print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
exp.test(setting, test=1)
# exp.predict(setting, load=True)
torch.cuda.empty_cache()
export CUDA_VISIBLE_DEVICES=0
model_name=iFlowformer
# model_name=Flowformer
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_96 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 96 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_192 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 192 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_336 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 336 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_720 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 720 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
export CUDA_VISIBLE_DEVICES=0
model_name=iInformer
# model_name=Informer
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_96 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 96 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_192 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 192 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_336 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 336 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_720 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 720 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
export CUDA_VISIBLE_DEVICES=0
model_name=iReformer
# model_name=Reformer
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_96 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 96 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_192 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 192 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_336 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 336 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_720 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 720 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
export CUDA_VISIBLE_DEVICES=0
model_name=iTransformer
# model_name=Transformer
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_96 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 96 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_192 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 192 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_336 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 336 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
python -u run.py \
--is_training 1 \
--root_path ./dataset/electricity/ \
--data_path electricity.csv \
--model_id ECL_96_720 \
--model $model_name \
--data custom \
--features M \
--seq_len 96 \
--pred_len 720 \
--e_layers 2 \
--enc_in 321 \
--dec_in 321 \
--c_out 321 \
--des 'Exp' \
--itr 1
# Inverted Transformers Work Better for Time Series Forecasting
This folder contains a comparison of vanilla Transformer-based forecasters and their inverted versions. If you are new to this repo, we recommend having a look at this [README](../multivariate_forecasting/README.md) first.
## Scripts
In each folder named after the dataset, we compare the performance of iTransformers and the vanilla Transformers.
```
# iTransformer on the Traffic Dataset with gradually enlarged lookback windows.
bash ./scripts/boost_performance/Traffic/iTransformer.sh
```
Change ```model_name``` in the script to switch between a Transformer variant and its inverted version.
## Results
We compare the performance of Transformer and iTransformer on all six datasets. The results indicate that applying attention and the feed-forward network on the inverted dimensions greatly empowers Transformers in multivariate time series forecasting.
<p align="center">
<img src="../../figures/boosting_trm.png" alt="" align=center />
</p>
We apply the proposed inverted framework to Transformer and its variants. The results demonstrate that the iTransformer framework consistently improves these Transformer variants and lets them take advantage of efficient attention mechanisms.
<p align="center">
<img src="../../figures/boosting.png" alt="" align=center />
</p>