Unverified Commit 03d70c41 authored by Hubert Lu, committed by GitHub

Cherry-picks some commits to replace torch.Tensor and remove the dependency on torch._six (#107)



* replace torch.Tensor with torch.empty (#1578)

* replace torch.Tensor with torch.empty

* nit

* nit

* torch.empty() must have args (#1584)

* use `torch.tensor` to create a tensor with initializer values (#1588)

* use `torch.tensor` with init values
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* Update apex/contrib/sparsity/sparse_masklib.py

* remove torch._six
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* retire `torch._six`

as per the upstream commit `b005ec62b9`.
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

* use std collections.abc
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

---------
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>

---------
Signed-off-by: Masaki Kozuki <mkozuki@nvidia.com>
Co-authored-by: Nouamane Tazi <nouamane98@gmail.com>
Co-authored-by: Masaki Kozuki <mkozuki@nvidia.com>
parent b047a1f1
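
The heart of the cherry-picked changes is which constructor gets used where. A minimal standalone sketch of the distinction, assuming a recent PyTorch (illustrative only, not taken from the diff below):

```python
import torch

# Legacy constructor: integer arguments are *sizes* and the result is an
# uninitialized float32 tensor; a list argument is instead treated as data.
w_legacy = torch.Tensor(3, 4)          # uninitialized 3x4 float32
d_legacy = torch.Tensor([1.0, 2.0])    # tensor([1., 2.])

# Replacement when the values get overwritten later (e.g. in
# reset_parameters()): torch.empty allocates uninitialized storage and
# accepts explicit dtype/device arguments.
w_new = torch.empty(3, 4)

# Replacement when the argument is data rather than a shape: torch.tensor
# copies the values and infers the dtype from them.
d_new = torch.tensor([1.0, 2.0])

# torch.Tensor() with no arguments gave a 0-element tensor; with the
# factory functions the size has to be spelled out ("torch.empty() must
# have args" above), e.g. torch.empty(0).
e_new = torch.empty(0)

assert w_new.shape == w_legacy.shape == (3, 4)
assert torch.equal(d_new, d_legacy)
```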
......@@ -254,17 +254,17 @@ class RNNCell(nn.Module):
self.gate_size = gate_multiplier * self.hidden_size
self.n_hidden_states = n_hidden_states
self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))
self.w_ih = nn.Parameter(torch.empty(self.gate_size, self.input_size))
self.w_hh = nn.Parameter(torch.empty(self.gate_size, self.output_size))
#Check if there's recurrent projection
if(self.output_size != self.hidden_size):
self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))
self.w_ho = nn.Parameter(torch.empty(self.output_size, self.hidden_size))
self.b_ih = self.b_hh = None
if self.bias:
self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
self.b_ih = nn.Parameter(torch.empty(self.gate_size))
self.b_hh = nn.Parameter(torch.empty(self.gate_size))
#hidden states for forward
self.hidden = [ None for states in range(self.n_hidden_states)]
......
......@@ -18,8 +18,8 @@ class mLSTMRNNCell(RNNCell):
gate_multiplier = 4
super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
self.w_mih = nn.Parameter(torch.empty(self.output_size, self.input_size))
self.w_mhh = nn.Parameter(torch.empty(self.output_size, self.output_size))
self.reset_parameters()
......
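
Swapping torch.Tensor for torch.empty in these modules is only safe because every parameter is subsequently overwritten by the reset_parameters() call shown above. A minimal sketch of that idiom with a hypothetical module (not apex code):

```python
import math

import torch
from torch import nn


class TinyLinear(nn.Module):
    """Hypothetical module showing the empty-then-reset idiom."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        # Allocate uninitialized storage; values are garbage until reset.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.bias = nn.Parameter(torch.empty(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        # Overwrite the uninitialized values before the module is ever used.
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        nn.init.zeros_(self.bias)

    def forward(self, x):
        return torch.nn.functional.linear(x, self.weight, self.bias)
```

Because the storage returned by torch.empty contains arbitrary bytes, forgetting the reset_parameters() call would silently produce garbage weights; that is the main thing to watch for when making this swap.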
......@@ -2,18 +2,8 @@
# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
from torch._six import container_abcs
else:
import collections.abc as container_abcs
class AmpState(object):
def __init__(self):
......
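
The version gate above existed only to pick up container_abcs, which torch re-exported under torch._six before 1.8. After the removal, the standard-library module is imported directly wherever it is needed; a small sketch of that usage (the flatten helper is hypothetical, not apex code):

```python
import collections.abc as container_abcs


def flatten(value):
    """Yield the leaves of an arbitrarily nested mapping/sequence."""
    if isinstance(value, container_abcs.Mapping):
        for v in value.values():
            yield from flatten(v)
    elif isinstance(value, container_abcs.Sequence) and not isinstance(value, str):
        for v in value:
            yield from flatten(v)
    else:
        yield value


print(list(flatten({"a": [1, 2], "b": (3,)})))  # [1, 2, 3]
```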
import torch
from torch._six import string_classes
import collections.abc as container_abcs
from types import MethodType
import functools
import numpy as np
import sys
from types import MethodType
import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
import numpy as np
import torch
from ._amp_state import _amp_state, warn_or_err
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
......@@ -39,7 +41,7 @@ def to_type(dtype, t):
def applier(value, fn):
if isinstance(value, torch.Tensor):
return fn(value)
elif isinstance(value, string_classes):
elif isinstance(value, str):
return value
elif isinstance(value, np.ndarray):
return value
......
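
string_classes also came from torch._six; on Python 3 it was effectively just the built-in string types, so the isinstance check against str above is the drop-in replacement. A condensed sketch of the dispatch applier performs (the container handling here is illustrative, not the exact apex code):

```python
import numpy as np
import torch


def applier(value, fn):
    """Apply fn to every tensor in a nested structure, leaving strings alone."""
    if isinstance(value, torch.Tensor):
        return fn(value)
    if isinstance(value, str):          # was: isinstance(value, string_classes)
        return value
    if isinstance(value, np.ndarray):
        return value
    if isinstance(value, dict):
        return {k: applier(v, fn) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return type(value)(applier(v, fn) for v in value)
    return value


halved = applier({"x": torch.ones(2), "tag": "batch0"}, lambda t: t * 0.5)
print(halved)  # {'x': tensor([0.5000, 0.5000]), 'tag': 'batch0'}
```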
import torch
from torch._six import inf
from typing import Union, Iterable
import torch
_kernel_import_succeeded = False
try:
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier
_kernel_import_succeeded = True
except:
except ImportError:
_kernel_import_succeeded = False
_tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]
def clip_grad_norm_(
parameters: _tensor_or_tensors, max_norm: float, norm_type: float = 2.0,
error_if_nonfinite: bool = False) -> torch.Tensor:
......
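
Two things change in this hunk: the bare except becomes except ImportError, so only a missing extension triggers the fallback, and from torch._six import inf goes away. torch._six.inf was a re-export of floating-point infinity; the diff does not show which spelling the commit uses instead, so the sketch below assumes math.inf:

```python
from math import inf  # stands in for `from torch._six import inf`

_kernel_import_succeeded = False
try:
    import amp_C  # optional fused-kernel extension
    from apex.multi_tensor_apply import multi_tensor_applier
    _kernel_import_succeeded = True
except ImportError:
    # A bare `except:` would also swallow unrelated failures (typos,
    # CUDA initialisation errors, KeyboardInterrupt, ...); catching
    # ImportError keeps the fallback limited to a missing extension.
    _kernel_import_succeeded = False

print(inf == float("inf"), _kernel_import_succeeded)
```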
......@@ -41,8 +41,8 @@ class FastLayerNorm(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-5):
super().__init__()
self.epsilon = eps
self.weight = torch.nn.Parameter(torch.Tensor(hidden_size))
self.bias = torch.nn.Parameter(torch.Tensor(hidden_size))
self.weight = torch.nn.Parameter(torch.empty(hidden_size))
self.bias = torch.nn.Parameter(torch.empty(hidden_size))
self.reset_parameters()
def reset_parameters(self):
......
......@@ -37,14 +37,14 @@ class EncdecMultiheadAttn(nn.Module):
self.impl = impl
self.scaling = self.head_dim ** -0.5
self.in_proj_weight_q = Parameter(torch.Tensor(embed_dim, embed_dim))
self.in_proj_weight_kv = Parameter(torch.Tensor(2 * embed_dim, embed_dim))
self.out_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.in_proj_weight_q = Parameter(torch.empty(embed_dim, embed_dim))
self.in_proj_weight_kv = Parameter(torch.empty(2 * embed_dim, embed_dim))
self.out_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
if self.bias:
assert impl != "fast", "ERROR! The Fast implementation does not support biases!"
self.in_proj_bias_q = Parameter(torch.Tensor(embed_dim))
self.in_proj_bias_kv = Parameter(torch.Tensor(2 * embed_dim))
self.out_proj_bias = Parameter(torch.Tensor(embed_dim))
self.in_proj_bias_q = Parameter(torch.empty(embed_dim))
self.in_proj_bias_kv = Parameter(torch.empty(2 * embed_dim))
self.out_proj_bias = Parameter(torch.empty(embed_dim))
else:
self.register_parameter("in_proj_bias_q", None)
self.register_parameter("in_proj_bias_kv", None)
......@@ -53,8 +53,8 @@ class EncdecMultiheadAttn(nn.Module):
self.out_proj_bias = None
if self.include_norm_add:
if impl == "fast":
self.lyr_nrm_gamma_weights = Parameter(torch.Tensor(embed_dim))
self.lyr_nrm_beta_weights = Parameter(torch.Tensor(embed_dim))
self.lyr_nrm_gamma_weights = Parameter(torch.empty(embed_dim))
self.lyr_nrm_beta_weights = Parameter(torch.empty(embed_dim))
self.lyr_nrm = None
else:
self.register_parameter("lyr_norm_gamma_weights", None)
......
......@@ -54,20 +54,20 @@ class SelfMultiheadAttn(nn.Module):
impl == "fast" and bias
), "additive mask not supported for fast mode without bias"
if separate_qkv_params:
self.q_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.k_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.v_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.q_weight = Parameter(torch.empty(embed_dim, embed_dim))
self.k_weight = Parameter(torch.empty(embed_dim, embed_dim))
self.v_weight = Parameter(torch.empty(embed_dim, embed_dim))
else:
self.in_proj_weight = Parameter(torch.Tensor(3 * embed_dim, embed_dim))
self.out_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
self.out_proj_weight = Parameter(torch.empty(embed_dim, embed_dim))
if self.bias:
if separate_qkv_params:
self.q_bias = Parameter(torch.Tensor(embed_dim))
self.k_bias = Parameter(torch.Tensor(embed_dim))
self.v_bias = Parameter(torch.Tensor(embed_dim))
self.q_bias = Parameter(torch.empty(embed_dim))
self.k_bias = Parameter(torch.empty(embed_dim))
self.v_bias = Parameter(torch.empty(embed_dim))
else:
self.in_proj_bias = Parameter(torch.Tensor(3 * embed_dim))
self.out_proj_bias = Parameter(torch.Tensor(embed_dim))
self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
self.out_proj_bias = Parameter(torch.empty(embed_dim))
else:
if separate_qkv_params:
self.register_parameter("q_bias", None)
......@@ -83,8 +83,8 @@ class SelfMultiheadAttn(nn.Module):
self.out_proj_bias = None
if self.include_norm_add:
if impl == "fast":
self.lyr_nrm_gamma_weights = Parameter(torch.Tensor(embed_dim))
self.lyr_nrm_beta_weights = Parameter(torch.Tensor(embed_dim))
self.lyr_nrm_gamma_weights = Parameter(torch.empty(embed_dim))
self.lyr_nrm_beta_weights = Parameter(torch.empty(embed_dim))
self.lyr_nrm = None
else:
self.register_parameter("lyr_norm_gamma_weights", None)
......
......@@ -29,7 +29,7 @@ def compute_valid_1d_patterns(m,n):
if m==4 and n==2 and valid_m4n2_1d_patterns is not None: return valid_m4n2_1d_patterns
patterns = torch.zeros(m)
patterns[:n] = 1
valid_patterns = torch.Tensor(list(set(permutations(patterns.tolist()))))
valid_patterns = torch.tensor(list(set(permutations(patterns.tolist()))))
if m == 4 and n == 2: valid_m4n2_1d_patterns = valid_patterns
return valid_patterns
......@@ -109,10 +109,10 @@ def compute_valid_2d_patterns(m,n):
patterns[:n] = 1
patterns = list(set(permutations(patterns.tolist())))
patterns = patterns + patterns
patterns = torch.Tensor(list(set(permutations(patterns,m))))
patterns = torch.empty(list(set(permutations(patterns,m))))
valid = ((patterns.sum(dim=1) <= n).sum(dim=1) == m).nonzero().view(-1)
valid_patterns = torch.Tensor(valid.shape[0],m,m)
valid_patterns = torch.empty(valid.shape[0],m,m)
valid_patterns[:] = patterns[valid[:]]
if m == 4 and n == 2: valid_m4n2_2d_patterns = valid_patterns
......
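
The sparsity helpers build their pattern tensors from permutation values, which is why compute_valid_1d_patterns above switches to torch.tensor. A small standalone sketch of that construction for m=4, n=2 (illustrative, not the apex implementation):

```python
from itertools import permutations

import torch

m, n = 4, 2

# Enumerate every length-m binary pattern with exactly n ones.
base = torch.zeros(m)
base[:n] = 1
patterns = torch.tensor(sorted(set(permutations(base.tolist()))))

# torch.tensor copies these *values*; torch.empty expects *sizes*, so the
# two constructors are not interchangeable when the list holds pattern data.
print(patterns.shape)       # torch.Size([6, 4]) -- C(4, 2) = 6 patterns
print(patterns.sum(dim=1))  # every row sums to n == 2
```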
......@@ -55,9 +55,9 @@ class FusedDense(nn.Module):
super(FusedDense, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = nn.Parameter(torch.Tensor(out_features, in_features))
self.weight = nn.Parameter(torch.empty(out_features, in_features))
if bias:
self.bias = nn.Parameter(torch.Tensor(out_features))
self.bias = nn.Parameter(torch.empty(out_features))
else:
#assert False, "no-bias option not added yet"
self.register_parameter('bias', None)
......@@ -75,10 +75,10 @@ class FusedDenseGeluDense(nn.Module):
self.in_features = in_features
self.intermediate_features = intermediate_features
self.out_features = out_features
self.weight1 = nn.Parameter(torch.Tensor(intermediate_features, in_features))
self.bias1 = nn.Parameter(torch.Tensor(intermediate_features))
self.weight2 = nn.Parameter(torch.Tensor(out_features, intermediate_features))
self.bias2 = nn.Parameter(torch.Tensor(out_features))
self.weight1 = nn.Parameter(torch.empty(intermediate_features, in_features))
self.bias1 = nn.Parameter(torch.empty(intermediate_features))
self.weight2 = nn.Parameter(torch.empty(out_features, intermediate_features))
self.bias2 = nn.Parameter(torch.empty(out_features))
def forward(self, input):
return fused_dense_gelu_dense_function(input, self.weight1, self.bias1, self.weight2, self.bias2)
......
......@@ -273,8 +273,8 @@ class FusedLayerNorm(torch.nn.Module):
self.eps = eps
self.elementwise_affine = elementwise_affine
if self.elementwise_affine:
self.weight = Parameter(torch.Tensor(*normalized_shape))
self.bias = Parameter(torch.Tensor(*normalized_shape))
self.weight = Parameter(torch.empty(*normalized_shape))
self.bias = Parameter(torch.empty(*normalized_shape))
else:
self.register_parameter("weight", None)
self.register_parameter("bias", None)
......@@ -369,7 +369,7 @@ class FusedRMSNorm(torch.nn.Module):
self.eps = eps
self.elementwise_affine = elementwise_affine
if self.elementwise_affine:
self.weight = Parameter(torch.Tensor(*normalized_shape))
self.weight = Parameter(torch.empty(*normalized_shape))
else:
self.register_parameter("weight", None)
self.reset_parameters()
......