Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
d0cb9fa2
Commit
d0cb9fa2
authored
Nov 04, 2018
by
thomwolf
Browse files
clean up model
parent
6cc65177
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
105 deletions
+58
-105
modeling.py
modeling.py
+58
-105
No files found.
modeling.py
View file @
d0cb9fa2
...
@@ -27,26 +27,28 @@ import torch.nn as nn
...
@@ -27,26 +27,28 @@ import torch.nn as nn
from
torch.nn
import
CrossEntropyLoss
from
torch.nn
import
CrossEntropyLoss
def
gelu
(
x
):
def
gelu
(
x
):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
"""
return
x
*
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
return
x
*
0.5
*
(
1.0
+
torch
.
erf
(
x
/
math
.
sqrt
(
2.0
)))
# For information: OpenAI GPT gelu version is a bit different:
# 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
class
BertConfig
(
object
):
class
BertConfig
(
object
):
"""Configuration
for
`BertModel`.
"""
"""Configuration
class to store the configuration of a
`BertModel`.
"""
def
__init__
(
self
,
def
__init__
(
self
,
vocab_size
,
vocab_size
,
hidden_size
=
768
,
hidden_size
=
768
,
num_hidden_layers
=
12
,
num_hidden_layers
=
12
,
num_attention_heads
=
12
,
num_attention_heads
=
12
,
intermediate_size
=
3072
,
intermediate_size
=
3072
,
hidden_act
=
"gelu"
,
hidden_act
=
"gelu"
,
hidden_dropout_prob
=
0.1
,
hidden_dropout_prob
=
0.1
,
attention_probs_dropout_prob
=
0.1
,
attention_probs_dropout_prob
=
0.1
,
max_position_embeddings
=
512
,
max_position_embeddings
=
512
,
type_vocab_size
=
16
,
type_vocab_size
=
16
,
initializer_range
=
0.02
):
initializer_range
=
0.02
):
"""Constructs BertConfig.
"""Constructs BertConfig.
Args:
Args:
...
@@ -110,42 +112,31 @@ class BertConfig(object):
...
@@ -110,42 +112,31 @@ class BertConfig(object):
class
BERTLayerNorm
(
nn
.
Module
):
class
BERTLayerNorm
(
nn
.
Module
):
def
__init__
(
self
,
config
,
variance_epsilon
=
1e-12
):
def
__init__
(
self
,
config
,
variance_epsilon
=
1e-12
):
"Construct a layernorm module in the TF style (epsilon inside the square root)."
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super
(
BERTLayerNorm
,
self
).
__init__
()
super
(
BERTLayerNorm
,
self
).
__init__
()
self
.
gamma
=
nn
.
Parameter
(
torch
.
ones
(
config
.
hidden_size
))
self
.
gamma
=
nn
.
Parameter
(
torch
.
ones
(
config
.
hidden_size
))
self
.
beta
=
nn
.
Parameter
(
torch
.
zeros
(
config
.
hidden_size
))
self
.
beta
=
nn
.
Parameter
(
torch
.
zeros
(
config
.
hidden_size
))
self
.
variance_epsilon
=
variance_epsilon
self
.
variance_epsilon
=
variance_epsilon
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
# TODO check it's identical to TF implementation in details (epsilon and axes)
u
=
x
.
mean
(
-
1
,
keepdim
=
True
)
u
=
x
.
mean
(
-
1
,
keepdim
=
True
)
s
=
(
x
-
u
).
pow
(
2
).
mean
(
-
1
,
keepdim
=
True
)
s
=
(
x
-
u
).
pow
(
2
).
mean
(
-
1
,
keepdim
=
True
)
x
=
(
x
-
u
)
/
torch
.
sqrt
(
s
+
self
.
variance_epsilon
)
x
=
(
x
-
u
)
/
torch
.
sqrt
(
s
+
self
.
variance_epsilon
)
return
self
.
gamma
*
x
+
self
.
beta
return
self
.
gamma
*
x
+
self
.
beta
# tf.contrib.layers.layer_norm(
# inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
class
BERTEmbeddings
(
nn
.
Module
):
class
BERTEmbeddings
(
nn
.
Module
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
(
BERTEmbeddings
,
self
).
__init__
()
super
(
BERTEmbeddings
,
self
).
__init__
()
"""Construct the embedding module from word, position and token_type embeddings.
"""
self
.
word_embeddings
=
nn
.
Embedding
(
config
.
vocab_size
,
config
.
hidden_size
)
self
.
word_embeddings
=
nn
.
Embedding
(
config
.
vocab_size
,
config
.
hidden_size
)
# Position embeddings are (normally) a contiguous range so we could use a slice
# Since the position embedding table is a learned variable, we create it
# using a (long) sequence length `max_position_embeddings`. The actual
# sequence length might be shorter than this, for faster training of
# tasks that do not have long sequences.
#
# So `full_position_embeddings` is effectively an embedding table
# for position [0, 1, 2, ..., max_position_embeddings-1], and the current
# sequence has positions [0, 1, 2, ... seq_length-1], so we can just
# perform a slice.
self
.
position_embeddings
=
nn
.
Embedding
(
config
.
max_position_embeddings
,
config
.
hidden_size
)
self
.
position_embeddings
=
nn
.
Embedding
(
config
.
max_position_embeddings
,
config
.
hidden_size
)
# token_type_embeddings vocabulary is very small. TF used one-hot embeddings to speed this up.
self
.
token_type_embeddings
=
nn
.
Embedding
(
config
.
type_vocab_size
,
config
.
hidden_size
)
self
.
token_type_embeddings
=
nn
.
Embedding
(
config
.
type_vocab_size
,
config
.
hidden_size
)
self
.
LayerNorm
=
BERTLayerNorm
(
config
)
# Not snake-cased to stick with TF model variable name
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self
.
LayerNorm
=
BERTLayerNorm
(
config
)
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
self
.
dropout
=
nn
.
Dropout
(
config
.
hidden_dropout_prob
)
def
forward
(
self
,
input_ids
,
token_type_ids
=
None
):
def
forward
(
self
,
input_ids
,
token_type_ids
=
None
):
...
@@ -182,65 +173,37 @@ class BERTSelfAttention(nn.Module):
...
@@ -182,65 +173,37 @@ class BERTSelfAttention(nn.Module):
self
.
dropout
=
nn
.
Dropout
(
config
.
attention_probs_dropout_prob
)
self
.
dropout
=
nn
.
Dropout
(
config
.
attention_probs_dropout_prob
)
def
transpose_for_scores
(
self
,
x
,
is_key_tensor
=
False
):
def
transpose_for_scores
(
self
,
x
):
new_x_shape
=
x
.
size
()[:
-
1
]
+
(
self
.
num_attention_heads
,
self
.
attention_head_size
)
new_x_shape
=
x
.
size
()[:
-
1
]
+
(
self
.
num_attention_heads
,
self
.
attention_head_size
)
x
=
x
.
view
(
*
new_x_shape
)
x
=
x
.
view
(
*
new_x_shape
)
if
is_key_tensor
:
return
x
.
permute
(
0
,
2
,
1
,
3
)
return
x
.
permute
(
0
,
2
,
3
,
1
)
else
:
return
x
.
permute
(
0
,
2
,
1
,
3
)
def
forward
(
self
,
hidden_states
,
attention_mask
):
def
forward
(
self
,
hidden_states
,
attention_mask
):
# Scalar dimensions referenced here:
# B = batch size (number of sequences)
# F = `from_tensor` sequence length
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
mixed_query_layer
=
self
.
query
(
hidden_states
)
mixed_query_layer
=
self
.
query
(
hidden_states
)
mixed_key_layer
=
self
.
key
(
hidden_states
)
mixed_key_layer
=
self
.
key
(
hidden_states
)
mixed_value_layer
=
self
.
value
(
hidden_states
)
mixed_value_layer
=
self
.
value
(
hidden_states
)
query_layer
=
self
.
transpose_for_scores
(
mixed_query_layer
)
query_layer
=
self
.
transpose_for_scores
(
mixed_query_layer
)
key_layer
=
self
.
transpose_for_scores
(
mixed_key_layer
)
#, is_key_tensor=True)
key_layer
=
self
.
transpose_for_scores
(
mixed_key_layer
)
value_layer
=
self
.
transpose_for_scores
(
mixed_value_layer
)
value_layer
=
self
.
transpose_for_scores
(
mixed_value_layer
)
# Take the dot product between "query" and "key" to get the raw
# Take the dot product between "query" and "key" to get the raw attention scores.
# attention scores.
attention_scores
=
torch
.
matmul
(
query_layer
,
key_layer
.
transpose
(
-
1
,
-
2
))
# `attention_scores` = [B, N, F, T]
attention_scores
=
attention_scores
/
math
.
sqrt
(
self
.
attention_head_size
)
attention_scores_no_norm
=
torch
.
matmul
(
query_layer
,
key_layer
.
transpose
(
-
1
,
-
2
))
# Apply the attention mask (precomputed for all layers in BertModel's forward() function)
attention_scores_no_mask
=
attention_scores_no_norm
/
math
.
sqrt
(
self
.
attention_head_size
)
attention_scores
=
attention_scores
+
attention_mask
# TODO clean up this (precompute)
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
# `attention_mask` = [B, 1, F, T]
# attention_mask = tf.expand_dims(attention_mask, axis=[1])
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# adder = (1.0 - attention_mask) * -10000.0
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_scores
=
attention_scores_no_mask
+
attention_mask
# Normalize the attention scores to probabilities.
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]
attention_probs
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
attention_probs_no_drop
=
nn
.
Softmax
(
dim
=-
1
)(
attention_scores
)
# This is actually dropping out entire tokens to attend to, which might
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs
=
self
.
dropout
(
attention_probs
_no_drop
)
attention_probs
=
self
.
dropout
(
attention_probs
)
context_layer
=
torch
.
matmul
(
attention_probs
,
value_layer
)
context_layer
=
torch
.
matmul
(
attention_probs
,
value_layer
)
context_layer
=
context_layer
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
context_layer
=
context_layer
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
new_context_layer_shape
=
context_layer
.
size
()[:
-
2
]
+
(
self
.
all_head_size
,)
new_context_layer_shape
=
context_layer
.
size
()[:
-
2
]
+
(
self
.
all_head_size
,)
context_layer
=
context_layer
.
view
(
*
new_context_layer_shape
)
context_layer
=
context_layer
.
view
(
*
new_context_layer_shape
)
# aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
# aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
# aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
return
context_layer
return
context_layer
...
@@ -317,12 +280,6 @@ class BERTEncoder(nn.Module):
...
@@ -317,12 +280,6 @@ class BERTEncoder(nn.Module):
self
.
layer
=
nn
.
ModuleList
([
copy
.
deepcopy
(
layer
)
for
_
in
range
(
config
.
num_hidden_layers
)])
self
.
layer
=
nn
.
ModuleList
([
copy
.
deepcopy
(
layer
)
for
_
in
range
(
config
.
num_hidden_layers
)])
def
forward
(
self
,
hidden_states
,
attention_mask
):
def
forward
(
self
,
hidden_states
,
attention_mask
):
"""
Args:
hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
Return:
float Tensor of shape [batch_size, seq_length, hidden_size]
"""
all_encoder_layers
=
[]
all_encoder_layers
=
[]
for
layer_module
in
self
.
layer
:
for
layer_module
in
self
.
layer
:
hidden_states
=
layer_module
(
hidden_states
,
attention_mask
)
hidden_states
=
layer_module
(
hidden_states
,
attention_mask
)
...
@@ -337,14 +294,8 @@ class BERTPooler(nn.Module):
...
@@ -337,14 +294,8 @@ class BERTPooler(nn.Module):
self
.
activation
=
nn
.
Tanh
()
self
.
activation
=
nn
.
Tanh
()
def
forward
(
self
,
hidden_states
):
def
forward
(
self
,
hidden_states
):
"""
Args:
hidden_states: float Tensor of shape [batch_size, seq_length, hidden_size]
Return:
float Tensor of shape [batch_size, hidden_size]
"""
# We "pool" the model by simply taking the hidden state corresponding
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
We assume that this has been pre-trained
# to the first token.
first_token_tensor
=
hidden_states
[:,
0
]
first_token_tensor
=
hidden_states
[:,
0
]
pooled_output
=
self
.
dense
(
first_token_tensor
)
pooled_output
=
self
.
dense
(
first_token_tensor
)
pooled_output
=
self
.
activation
(
pooled_output
)
pooled_output
=
self
.
activation
(
pooled_output
)
...
@@ -373,10 +324,6 @@ class BertModel(nn.Module):
...
@@ -373,10 +324,6 @@ class BertModel(nn.Module):
Args:
Args:
config: `BertConfig` instance.
config: `BertConfig` instance.
Raises:
ValueError: The config is invalid or one of the input tensor shapes
is invalid.
"""
"""
super
(
BertModel
,
self
).
__init__
()
super
(
BertModel
,
self
).
__init__
()
self
.
embeddings
=
BERTEmbeddings
(
config
)
self
.
embeddings
=
BERTEmbeddings
(
config
)
...
@@ -384,26 +331,30 @@ class BertModel(nn.Module):
...
@@ -384,26 +331,30 @@ class BertModel(nn.Module):
self
.
pooler
=
BERTPooler
(
config
)
self
.
pooler
=
BERTPooler
(
config
)
def
forward
(
self
,
input_ids
,
token_type_ids
=
None
,
attention_mask
=
None
):
def
forward
(
self
,
input_ids
,
token_type_ids
=
None
,
attention_mask
=
None
):
# We create 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, from_seq_length]
# So we can broadcast to [batch_size, num_heads, to_seq_length, from_seq_length]
# It's simpler than the triangular masking of causal attention; we just need to
# prepare the broadcast here
if
attention_mask
is
None
:
if
attention_mask
is
None
:
attention_mask
=
torch
.
ones_like
(
input_ids
)
attention_mask
=
torch
.
ones_like
(
input_ids
)
if
token_type_ids
is
None
:
if
token_type_ids
is
None
:
token_type_ids
=
torch
.
zeros_like
(
input_ids
)
token_type_ids
=
torch
.
zeros_like
(
input_ids
)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, from_seq_length]
# So we can broadcast to [batch_size, num_heads, to_seq_length, from_seq_length]
# This attention mask is simpler than the triangular masking of causal attention
# used in OpenAI GPT; we just need to prepare the broadcast dimension here.
extended_attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
extended_attention_mask
=
attention_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask
=
extended_attention_mask
.
float
()
extended_attention_mask
=
(
1.0
-
extended_attention_mask
)
*
-
10000.0
extended_attention_mask
=
(
1.0
-
extended_attention_mask
)
*
-
10000.0
embedding_output
=
self
.
embeddings
(
input_ids
,
token_type_ids
)
embedding_output
=
self
.
embeddings
(
input_ids
,
token_type_ids
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
extended_attention_mask
)
all_encoder_layers
=
self
.
encoder
(
embedding_output
,
extended_attention_mask
)
sequence_output
=
all_encoder_layers
[
-
1
]
sequence_output
=
all_encoder_layers
[
-
1
]
pooled_output
=
self
.
pooler
(
sequence_output
)
pooled_output
=
self
.
pooler
(
sequence_output
)
# TODO debugging
# all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
return
all_encoder_layers
,
pooled_output
return
all_encoder_layers
,
pooled_output
class
BertForSequenceClassification
(
nn
.
Module
):
class
BertForSequenceClassification
(
nn
.
Module
):
...
@@ -435,9 +386,14 @@ class BertForSequenceClassification(nn.Module):
...
@@ -435,9 +386,14 @@ class BertForSequenceClassification(nn.Module):
def
init_weights
(
m
):
def
init_weights
(
m
):
if
isinstance
(
m
,
(
nn
.
Linear
,
nn
.
Embedding
)):
if
isinstance
(
m
,
(
nn
.
Linear
,
nn
.
Embedding
)):
# Slight differen
ce here with
the TF version which uses truncated_normal
# Slight
ly
differen
t from
the TF version which uses truncated_normal
for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
# cf https://github.com/pytorch/pytorch/pull/5617
m
.
weight
.
data
.
normal_
(
config
.
initializer_range
)
m
.
weight
.
data
.
normal_
(
config
.
initializer_range
)
elif
isinstance
(
m
,
BERTLayerNorm
):
m
.
beta
.
data
.
normal_
(
config
.
initializer_range
)
m
.
gamma
.
data
.
normal_
(
config
.
initializer_range
)
if
isinstance
(
m
,
nn
.
Linear
):
m
.
bias
.
data
.
zero_
()
self
.
apply
(
init_weights
)
self
.
apply
(
init_weights
)
def
forward
(
self
,
input_ids
,
token_type_ids
,
attention_mask
,
labels
=
None
):
def
forward
(
self
,
input_ids
,
token_type_ids
,
attention_mask
,
labels
=
None
):
...
@@ -474,13 +430,13 @@ class BertForQuestionAnswering(nn.Module):
...
@@ -474,13 +430,13 @@ class BertForQuestionAnswering(nn.Module):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
(
BertForQuestionAnswering
,
self
).
__init__
()
super
(
BertForQuestionAnswering
,
self
).
__init__
()
self
.
bert
=
BertModel
(
config
)
self
.
bert
=
BertModel
(
config
)
# TODO check if it's normal there is no dropout on SQuAD in the TF version
# TODO check
with Google
if it's normal there is no dropout on
the token classifier of
SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self
.
qa_outputs
=
nn
.
Linear
(
config
.
hidden_size
,
2
)
self
.
qa_outputs
=
nn
.
Linear
(
config
.
hidden_size
,
2
)
def
init_weights
(
m
):
def
init_weights
(
m
):
if
isinstance
(
m
,
(
nn
.
Linear
,
nn
.
Embedding
)):
if
isinstance
(
m
,
(
nn
.
Linear
,
nn
.
Embedding
)):
# Slight differen
ce here with
the TF version which uses truncated_normal for initialization
# Slight
ly
differen
t from
the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
# cf https://github.com/pytorch/pytorch/pull/5617
m
.
weight
.
data
.
normal_
(
config
.
initializer_range
)
m
.
weight
.
data
.
normal_
(
config
.
initializer_range
)
elif
isinstance
(
m
,
BERTLayerNorm
):
elif
isinstance
(
m
,
BERTLayerNorm
):
...
@@ -497,20 +453,17 @@ class BertForQuestionAnswering(nn.Module):
...
@@ -497,20 +453,17 @@ class BertForQuestionAnswering(nn.Module):
start_logits
,
end_logits
=
logits
.
split
(
1
,
dim
=-
1
)
start_logits
,
end_logits
=
logits
.
split
(
1
,
dim
=-
1
)
if
start_positions
is
not
None
and
end_positions
is
not
None
:
if
start_positions
is
not
None
and
end_positions
is
not
None
:
#loss_fct = CrossEntropyLoss()
#start_loss = loss_fct(start_logits, start_positions)
#end_loss = loss_fct(end_logits, end_positions)
batch_size
,
seq_length
=
input_ids
.
size
()
batch_size
,
seq_length
=
input_ids
.
size
()
def
compute_loss
(
logits
,
positions
):
def
compute_loss
(
logits
,
positions
):
max_position
=
positions
.
max
().
item
()
max_position
=
positions
.
max
().
item
()
one_hot
=
torch
.
FloatTensor
(
batch_size
,
max
(
max_position
,
seq_length
)
+
1
).
zero_
()
one_hot
=
torch
.
FloatTensor
(
batch_size
,
max
(
max_position
,
seq_length
)
+
1
).
zero_
()
one_hot
=
one_hot
.
scatter_
(
1
,
positions
.
cpu
(),
1
)
#
Second argument needs to be a LongTensor and not a cuda.LongTensor
one_hot
=
one_hot
.
scatter_
(
1
,
positions
.
cpu
(),
1
)
#
Do this on CPU
one_hot
=
one_hot
[:,
:
seq_length
].
to
(
input_ids
.
device
)
one_hot
=
one_hot
[:,
:
seq_length
].
to
(
input_ids
.
device
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits
,
dim
=
-
1
).
view
(
batch_size
,
seq_length
)
log_probs
=
nn
.
functional
.
log_softmax
(
logits
,
dim
=
-
1
).
view
(
batch_size
,
seq_length
)
loss
=
-
torch
.
mean
(
torch
.
sum
(
one_hot
*
log_probs
),
dim
=
-
1
)
loss
=
-
torch
.
mean
(
torch
.
sum
(
one_hot
*
log_probs
),
dim
=
-
1
)
return
loss
return
loss
start_loss
=
compute_loss
(
start_logits
,
start_positions
)
start_loss
=
compute_loss
(
start_logits
,
start_positions
)
end_loss
=
compute_loss
(
end_logits
,
end_positions
)
end_loss
=
compute_loss
(
end_logits
,
end_positions
)
total_loss
=
(
start_loss
+
end_loss
)
/
2
total_loss
=
(
start_loss
+
end_loss
)
/
2
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment