Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
1c933358
Commit
1c933358
authored
Jan 06, 2020
by
thomwolf
Browse files
formating
parent
e25b6fe3
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
358 additions
and
240 deletions
+358
-240
examples/hans/hans_processors.py
examples/hans/hans_processors.py
+62
-51
examples/hans/test_hans.py
examples/hans/test_hans.py
+291
-186
examples/hans/utils_hans.py
examples/hans/utils_hans.py
+5
-3
No files found.
examples/hans/hans_processors.py
View file @
1c933358
...
...
@@ -18,8 +18,9 @@
import
logging
import
os
from
utils_hans
import
DataProcessor
,
InputExample
,
InputFeatures
from
transformers.file_utils
import
is_tf_available
from
utils_hans
import
DataProcessor
,
InputExample
,
InputFeatures
if
is_tf_available
():
import
tensorflow
as
tf
...
...
@@ -27,7 +28,9 @@ if is_tf_available():
logger
=
logging
.
getLogger
(
__name__
)
def
hans_convert_examples_to_features
(
examples
,
tokenizer
,
def
hans_convert_examples_to_features
(
examples
,
tokenizer
,
max_length
=
512
,
task
=
None
,
label_list
=
None
,
...
...
@@ -35,7 +38,8 @@ def hans_convert_examples_to_features(examples, tokenizer,
pad_on_left
=
False
,
pad_token
=
0
,
pad_token_segment_id
=
0
,
mask_padding_with_zero
=
True
):
mask_padding_with_zero
=
True
,
):
"""
Loads a data file into a list of ``InputFeatures``
...
...
@@ -82,12 +86,7 @@ def hans_convert_examples_to_features(examples, tokenizer,
example
=
processor
.
get_example_from_tensor_dict
(
example
)
example
=
processor
.
tfds_map
(
example
)
inputs
=
tokenizer
.
encode_plus
(
example
.
text_a
,
example
.
text_b
,
add_special_tokens
=
True
,
max_length
=
max_length
,
)
inputs
=
tokenizer
.
encode_plus
(
example
.
text_a
,
example
.
text_b
,
add_special_tokens
=
True
,
max_length
=
max_length
,)
input_ids
,
token_type_ids
=
inputs
[
"input_ids"
],
inputs
[
"token_type_ids"
]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
...
...
@@ -106,8 +105,12 @@ def hans_convert_examples_to_features(examples, tokenizer,
token_type_ids
=
token_type_ids
+
([
pad_token_segment_id
]
*
padding_length
)
assert
len
(
input_ids
)
==
max_length
,
"Error with input length {} vs {}"
.
format
(
len
(
input_ids
),
max_length
)
assert
len
(
attention_mask
)
==
max_length
,
"Error with input length {} vs {}"
.
format
(
len
(
attention_mask
),
max_length
)
assert
len
(
token_type_ids
)
==
max_length
,
"Error with input length {} vs {}"
.
format
(
len
(
token_type_ids
),
max_length
)
assert
len
(
attention_mask
)
==
max_length
,
"Error with input length {} vs {}"
.
format
(
len
(
attention_mask
),
max_length
)
assert
len
(
token_type_ids
)
==
max_length
,
"Error with input length {} vs {}"
.
format
(
len
(
token_type_ids
),
max_length
)
if
output_mode
==
"classification"
:
label
=
label_map
[
example
.
label
]
if
example
.
label
in
label_map
else
0
...
...
@@ -128,28 +131,40 @@ def hans_convert_examples_to_features(examples, tokenizer,
logger
.
info
(
"label: %s (id = %d)"
%
(
example
.
label
,
label
))
features
.
append
(
InputFeatures
(
input_ids
=
input_ids
,
InputFeatures
(
input_ids
=
input_ids
,
attention_mask
=
attention_mask
,
token_type_ids
=
token_type_ids
,
label
=
label
,
pairID
=
pairID
))
label
=
label
,
pairID
=
pairID
,
)
)
if
is_tf_available
()
and
is_tf_dataset
:
def
gen
():
for
ex
in
features
:
yield
({
'input_ids'
:
ex
.
input_ids
,
'attention_mask'
:
ex
.
attention_mask
,
'token_type_ids'
:
ex
.
token_type_ids
},
ex
.
label
)
return
tf
.
data
.
Dataset
.
from_generator
(
gen
,
({
'input_ids'
:
tf
.
int32
,
'attention_mask'
:
tf
.
int32
,
'token_type_ids'
:
tf
.
int32
},
tf
.
int64
),
({
'input_ids'
:
tf
.
TensorShape
([
None
]),
'attention_mask'
:
tf
.
TensorShape
([
None
]),
'token_type_ids'
:
tf
.
TensorShape
([
None
])},
tf
.
TensorShape
([])))
yield
(
{
"input_ids"
:
ex
.
input_ids
,
"attention_mask"
:
ex
.
attention_mask
,
"token_type_ids"
:
ex
.
token_type_ids
,
},
ex
.
label
,
)
return
tf
.
data
.
Dataset
.
from_generator
(
gen
,
({
"input_ids"
:
tf
.
int32
,
"attention_mask"
:
tf
.
int32
,
"token_type_ids"
:
tf
.
int32
},
tf
.
int64
),
(
{
"input_ids"
:
tf
.
TensorShape
([
None
]),
"attention_mask"
:
tf
.
TensorShape
([
None
]),
"token_type_ids"
:
tf
.
TensorShape
([
None
]),
},
tf
.
TensorShape
([]),
),
)
return
features
...
...
@@ -159,21 +174,20 @@ class HansProcessor(DataProcessor):
def
get_example_from_tensor_dict
(
self
,
tensor_dict
):
"""See base class."""
return
InputExample
(
tensor_dict
[
'idx'
].
numpy
(),
tensor_dict
[
'premise'
].
numpy
().
decode
(
'utf-8'
),
tensor_dict
[
'hypothesis'
].
numpy
().
decode
(
'utf-8'
),
str
(
tensor_dict
[
'label'
].
numpy
()))
return
InputExample
(
tensor_dict
[
"idx"
].
numpy
(),
tensor_dict
[
"premise"
].
numpy
().
decode
(
"utf-8"
),
tensor_dict
[
"hypothesis"
].
numpy
().
decode
(
"utf-8"
),
str
(
tensor_dict
[
"label"
].
numpy
()),
)
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"heuristics_train_set.txt"
)),
"train"
)
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"heuristics_train_set.txt"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"heuristics_evaluation_set.txt"
)),
"dev"
)
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"heuristics_evaluation_set.txt"
)),
"dev"
)
def
get_labels
(
self
):
"""See base class."""
...
...
@@ -188,14 +202,12 @@ class HansProcessor(DataProcessor):
guid
=
"%s-%s"
%
(
set_type
,
line
[
0
])
text_a
=
line
[
5
]
text_b
=
line
[
6
]
pairID
=
line
[
7
][
2
:]
if
line
[
7
].
startswith
(
'
ex
'
)
else
line
[
7
]
pairID
=
line
[
7
][
2
:]
if
line
[
7
].
startswith
(
"
ex
"
)
else
line
[
7
]
label
=
line
[
-
1
]
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
,
pairID
=
pairID
))
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
text_b
,
label
=
label
,
pairID
=
pairID
))
return
examples
glue_tasks_num_labels
=
{
"hans"
:
3
,
}
...
...
@@ -207,4 +219,3 @@ glue_processors = {
glue_output_modes
=
{
"hans"
:
"classification"
,
}
examples/hans/test_hans.py
View file @
1c933358
...
...
@@ -19,60 +19,72 @@ from __future__ import absolute_import, division, print_function
import
argparse
import
glob
import
json
import
logging
import
os
import
random
import
json
import
numpy
as
np
import
torch
from
torch.utils.data
import
(
DataLoader
,
RandomSampler
,
SequentialSampler
,
TensorDataset
)
from
torch.utils.data
import
DataLoader
,
RandomSampler
,
SequentialSampler
,
TensorDataset
from
torch.utils.data.distributed
import
DistributedSampler
try
:
from
torch.utils.tensorboard
import
SummaryWriter
except
:
from
tensorboardX
import
SummaryWriter
from
tqdm
import
tqdm
,
trange
from
transformers
import
(
WEIGHTS_NAME
,
BertConfig
,
BertForSequenceClassification
,
BertTokenizer
,
from
hans_processors
import
glue_output_modes
as
output_modes
from
hans_processors
import
glue_processors
as
processors
from
hans_processors
import
hans_convert_examples_to_features
as
convert_examples_to_features
from
transformers
import
(
WEIGHTS_NAME
,
AdamW
,
AlbertConfig
,
AlbertForSequenceClassification
,
AlbertTokenizer
,
BertConfig
,
BertForSequenceClassification
,
BertTokenizer
,
DistilBertConfig
,
DistilBertForSequenceClassification
,
DistilBertTokenizer
,
RobertaConfig
,
RobertaForSequenceClassification
,
RobertaTokenizer
,
XLMConfig
,
XLMForSequenceClassification
,
XLMTokenizer
,
XLNetConfig
,
XLMConfig
,
XLMForSequenceClassification
,
XLMTokenizer
,
XLNetConfig
,
XLNetForSequenceClassification
,
XLNetTokenizer
,
DistilBertConfig
,
DistilBertForSequenceClassification
,
DistilBertTokenizer
,
AlbertConfig
,
AlbertForSequenceClassification
,
AlbertTokenizer
,
)
get_linear_schedule_with_warmup
,
)
from
transformers
import
glue_compute_metrics
as
compute_metrics
try
:
from
torch.utils.tensorboard
import
SummaryWriter
except
:
from
tensorboardX
import
SummaryWriter
from
transformers
import
AdamW
,
get_linear_schedule_with_warmup
from
transformers
import
glue_compute_metrics
as
compute_metrics
from
hans_processors
import
glue_output_modes
as
output_modes
from
hans_processors
import
glue_processors
as
processors
from
hans_processors
import
hans_convert_examples_to_features
as
convert_examples_to_features
logger
=
logging
.
getLogger
(
__name__
)
ALL_MODELS
=
sum
((
tuple
(
conf
.
pretrained_config_archive_map
.
keys
())
for
conf
in
(
BertConfig
,
XLNetConfig
,
XLMConfig
,
RobertaConfig
,
DistilBertConfig
)),
())
ALL_MODELS
=
sum
(
(
tuple
(
conf
.
pretrained_config_archive_map
.
keys
())
for
conf
in
(
BertConfig
,
XLNetConfig
,
XLMConfig
,
RobertaConfig
,
DistilBertConfig
)
),
(),
)
MODEL_CLASSES
=
{
'
bert
'
:
(
BertConfig
,
BertForSequenceClassification
,
BertTokenizer
),
'
xlnet
'
:
(
XLNetConfig
,
XLNetForSequenceClassification
,
XLNetTokenizer
),
'
xlm
'
:
(
XLMConfig
,
XLMForSequenceClassification
,
XLMTokenizer
),
'
roberta
'
:
(
RobertaConfig
,
RobertaForSequenceClassification
,
RobertaTokenizer
),
'
distilbert
'
:
(
DistilBertConfig
,
DistilBertForSequenceClassification
,
DistilBertTokenizer
),
'
albert
'
:
(
AlbertConfig
,
AlbertForSequenceClassification
,
AlbertTokenizer
)
"
bert
"
:
(
BertConfig
,
BertForSequenceClassification
,
BertTokenizer
),
"
xlnet
"
:
(
XLNetConfig
,
XLNetForSequenceClassification
,
XLNetTokenizer
),
"
xlm
"
:
(
XLMConfig
,
XLMForSequenceClassification
,
XLMTokenizer
),
"
roberta
"
:
(
RobertaConfig
,
RobertaForSequenceClassification
,
RobertaTokenizer
),
"
distilbert
"
:
(
DistilBertConfig
,
DistilBertForSequenceClassification
,
DistilBertTokenizer
),
"
albert
"
:
(
AlbertConfig
,
AlbertForSequenceClassification
,
AlbertTokenizer
)
,
}
...
...
@@ -100,14 +112,19 @@ def train(args, train_dataset, model, tokenizer):
t_total
=
len
(
train_dataloader
)
//
args
.
gradient_accumulation_steps
*
args
.
num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay
=
[
'
bias
'
,
'
LayerNorm.weight
'
]
no_decay
=
[
"
bias
"
,
"
LayerNorm.weight
"
]
optimizer_grouped_parameters
=
[
{
'params'
:
[
p
for
n
,
p
in
model
.
named_parameters
()
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
args
.
weight_decay
},
{
'params'
:
[
p
for
n
,
p
in
model
.
named_parameters
()
if
any
(
nd
in
n
for
nd
in
no_decay
)],
'weight_decay'
:
0.0
}
{
"params"
:
[
p
for
n
,
p
in
model
.
named_parameters
()
if
not
any
(
nd
in
n
for
nd
in
no_decay
)],
"weight_decay"
:
args
.
weight_decay
,
},
{
"params"
:
[
p
for
n
,
p
in
model
.
named_parameters
()
if
any
(
nd
in
n
for
nd
in
no_decay
)],
"weight_decay"
:
0.0
},
]
optimizer
=
AdamW
(
optimizer_grouped_parameters
,
lr
=
args
.
learning_rate
,
eps
=
args
.
adam_epsilon
)
scheduler
=
get_linear_schedule_with_warmup
(
optimizer
,
num_warmup_steps
=
args
.
warmup_steps
,
num_training_steps
=
t_total
)
scheduler
=
get_linear_schedule_with_warmup
(
optimizer
,
num_warmup_steps
=
args
.
warmup_steps
,
num_training_steps
=
t_total
)
if
args
.
fp16
:
try
:
from
apex
import
amp
...
...
@@ -121,17 +138,21 @@ def train(args, train_dataset, model, tokenizer):
# Distributed training (should be after apex fp16 initialization)
if
args
.
local_rank
!=
-
1
:
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
local_rank
],
output_device
=
args
.
local_rank
,
find_unused_parameters
=
True
)
model
=
torch
.
nn
.
parallel
.
DistributedDataParallel
(
model
,
device_ids
=
[
args
.
local_rank
],
output_device
=
args
.
local_rank
,
find_unused_parameters
=
True
)
# Train!
logger
.
info
(
"***** Running training *****"
)
logger
.
info
(
" Num examples = %d"
,
len
(
train_dataset
))
logger
.
info
(
" Num Epochs = %d"
,
args
.
num_train_epochs
)
logger
.
info
(
" Instantaneous batch size per GPU = %d"
,
args
.
per_gpu_train_batch_size
)
logger
.
info
(
" Total train batch size (w. parallel, distributed & accumulation) = %d"
,
args
.
train_batch_size
*
args
.
gradient_accumulation_steps
*
(
torch
.
distributed
.
get_world_size
()
if
args
.
local_rank
!=
-
1
else
1
))
logger
.
info
(
" Total train batch size (w. parallel, distributed & accumulation) = %d"
,
args
.
train_batch_size
*
args
.
gradient_accumulation_steps
*
(
torch
.
distributed
.
get_world_size
()
if
args
.
local_rank
!=
-
1
else
1
),
)
logger
.
info
(
" Gradient Accumulation steps = %d"
,
args
.
gradient_accumulation_steps
)
logger
.
info
(
" Total optimization steps = %d"
,
t_total
)
...
...
@@ -145,11 +166,11 @@ def train(args, train_dataset, model, tokenizer):
for
step
,
batch
in
enumerate
(
epoch_iterator
):
model
.
train
()
batch
=
tuple
(
t
.
to
(
args
.
device
)
for
t
in
batch
)
inputs
=
{
'
input_ids
'
:
batch
[
0
],
'attention_mask'
:
batch
[
1
],
'labels'
:
batch
[
3
]}
if
args
.
model_type
!=
'distilbert'
:
inputs
[
'token_type_ids'
]
=
batch
[
2
]
if
args
.
model_type
in
[
'bert'
,
'xlnet'
]
else
None
# XLM, DistilBERT and RoBERTa don't use segment_ids
inputs
=
{
"
input_ids
"
:
batch
[
0
],
"attention_mask"
:
batch
[
1
],
"labels"
:
batch
[
3
]}
if
args
.
model_type
!=
"distilbert"
:
inputs
[
"token_type_ids"
]
=
(
batch
[
2
]
if
args
.
model_type
in
[
"bert"
,
"xlnet"
]
else
None
)
# XLM, DistilBERT and RoBERTa don't use segment_ids
outputs
=
model
(
**
inputs
)
loss
=
outputs
[
0
]
# model outputs are always tuple in transformers (see doc)
...
...
@@ -178,30 +199,34 @@ def train(args, train_dataset, model, tokenizer):
if
args
.
local_rank
in
[
-
1
,
0
]
and
args
.
logging_steps
>
0
and
global_step
%
args
.
logging_steps
==
0
:
logs
=
{}
if
args
.
local_rank
==
-
1
and
args
.
evaluate_during_training
:
# Only evaluate when single GPU otherwise metrics may not average well
if
(
args
.
local_rank
==
-
1
and
args
.
evaluate_during_training
):
# Only evaluate when single GPU otherwise metrics may not average well
results
=
evaluate
(
args
,
model
,
tokenizer
)
for
key
,
value
in
results
.
items
():
eval_key
=
'
eval_{}
'
.
format
(
key
)
eval_key
=
"
eval_{}
"
.
format
(
key
)
logs
[
eval_key
]
=
value
loss_scalar
=
(
tr_loss
-
logging_loss
)
/
args
.
logging_steps
learning_rate_scalar
=
scheduler
.
get_lr
()[
0
]
logs
[
'
learning_rate
'
]
=
learning_rate_scalar
logs
[
'
loss
'
]
=
loss_scalar
logs
[
"
learning_rate
"
]
=
learning_rate_scalar
logs
[
"
loss
"
]
=
loss_scalar
logging_loss
=
tr_loss
for
key
,
value
in
logs
.
items
():
tb_writer
.
add_scalar
(
key
,
value
,
global_step
)
#print(json.dumps({**logs, **{'step': global_step}}))
#
print(json.dumps({**logs, **{'step': global_step}}))
if
args
.
local_rank
in
[
-
1
,
0
]
and
args
.
save_steps
>
0
and
global_step
%
args
.
save_steps
==
0
:
# Save model checkpoint
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
'
checkpoint-{}
'
.
format
(
global_step
))
output_dir
=
os
.
path
.
join
(
args
.
output_dir
,
"
checkpoint-{}
"
.
format
(
global_step
))
if
not
os
.
path
.
exists
(
output_dir
):
os
.
makedirs
(
output_dir
)
model_to_save
=
model
.
module
if
hasattr
(
model
,
'module'
)
else
model
# Take care of distributed/parallel training
model_to_save
=
(
model
.
module
if
hasattr
(
model
,
"module"
)
else
model
)
# Take care of distributed/parallel training
model_to_save
.
save_pretrained
(
output_dir
)
torch
.
save
(
args
,
os
.
path
.
join
(
output_dir
,
'
training_args.bin
'
))
torch
.
save
(
args
,
os
.
path
.
join
(
output_dir
,
"
training_args.bin
"
))
logger
.
info
(
"Saving model checkpoint to %s"
,
output_dir
)
if
args
.
max_steps
>
0
and
global_step
>
args
.
max_steps
:
...
...
@@ -220,7 +245,7 @@ def train(args, train_dataset, model, tokenizer):
def
evaluate
(
args
,
model
,
tokenizer
,
prefix
=
""
):
# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_task_names
=
(
"mnli"
,
"mnli-mm"
)
if
args
.
task_name
==
"mnli"
else
(
args
.
task_name
,)
eval_outputs_dirs
=
(
args
.
output_dir
,
args
.
output_dir
+
'
-MM
'
)
if
args
.
task_name
==
"mnli"
else
(
args
.
output_dir
,)
eval_outputs_dirs
=
(
args
.
output_dir
,
args
.
output_dir
+
"
-MM
"
)
if
args
.
task_name
==
"mnli"
else
(
args
.
output_dir
,)
results
=
{}
for
eval_task
,
eval_output_dir
in
zip
(
eval_task_names
,
eval_outputs_dirs
):
...
...
@@ -251,11 +276,11 @@ def evaluate(args, model, tokenizer, prefix=""):
batch
=
tuple
(
t
.
to
(
args
.
device
)
for
t
in
batch
)
with
torch
.
no_grad
():
inputs
=
{
'
input_ids
'
:
batch
[
0
],
'attention_mask'
:
batch
[
1
],
'labels'
:
batch
[
3
]}
if
args
.
model_type
!=
'distilbert'
:
inputs
[
'token_type_ids'
]
=
batch
[
2
]
if
args
.
model_type
in
[
'bert'
,
'xlnet'
]
else
None
# XLM, DistilBERT and RoBERTa don't use segment_ids
inputs
=
{
"
input_ids
"
:
batch
[
0
],
"attention_mask"
:
batch
[
1
],
"labels"
:
batch
[
3
]}
if
args
.
model_type
!=
"distilbert"
:
inputs
[
"token_type_ids"
]
=
(
batch
[
2
]
if
args
.
model_type
in
[
"bert"
,
"xlnet"
]
else
None
)
# XLM, DistilBERT and RoBERTa don't use segment_ids
outputs
=
model
(
**
inputs
)
tmp_eval_loss
,
logits
=
outputs
[:
2
]
...
...
@@ -263,11 +288,11 @@ def evaluate(args, model, tokenizer, prefix=""):
nb_eval_steps
+=
1
if
preds
is
None
:
preds
=
logits
.
detach
().
cpu
().
numpy
()
out_label_ids
=
inputs
[
'
labels
'
].
detach
().
cpu
().
numpy
()
out_label_ids
=
inputs
[
"
labels
"
].
detach
().
cpu
().
numpy
()
pair_ids
=
batch
[
4
].
detach
().
cpu
().
numpy
()
else
:
preds
=
np
.
append
(
preds
,
logits
.
detach
().
cpu
().
numpy
(),
axis
=
0
)
out_label_ids
=
np
.
append
(
out_label_ids
,
inputs
[
'
labels
'
].
detach
().
cpu
().
numpy
(),
axis
=
0
)
out_label_ids
=
np
.
append
(
out_label_ids
,
inputs
[
"
labels
"
].
detach
().
cpu
().
numpy
(),
axis
=
0
)
pair_ids
=
np
.
append
(
pair_ids
,
batch
[
4
].
detach
().
cpu
().
numpy
(),
axis
=
0
)
eval_loss
=
eval_loss
/
nb_eval_steps
...
...
@@ -280,7 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""):
with
open
(
output_eval_file
,
"w"
)
as
writer
:
writer
.
write
(
"pairID,gld_label
\n
"
)
for
pid
,
pred
in
zip
(
pair_ids
,
preds
):
writer
.
write
(
'
ex
'
+
str
(
pid
)
+
','
+
label_list
[
int
(
pred
)]
+
'
\n
'
)
writer
.
write
(
"
ex
"
+
str
(
pid
)
+
","
+
label_list
[
int
(
pred
)]
+
"
\n
"
)
return
results
...
...
@@ -292,11 +317,15 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor
=
processors
[
task
]()
output_mode
=
output_modes
[
task
]
# Load data features from cache or dataset file
cached_features_file
=
os
.
path
.
join
(
args
.
data_dir
,
'cached_{}_{}_{}_{}'
.
format
(
'dev'
if
evaluate
else
'train'
,
list
(
filter
(
None
,
args
.
model_name_or_path
.
split
(
'/'
))).
pop
(),
cached_features_file
=
os
.
path
.
join
(
args
.
data_dir
,
"cached_{}_{}_{}_{}"
.
format
(
"dev"
if
evaluate
else
"train"
,
list
(
filter
(
None
,
args
.
model_name_or_path
.
split
(
"/"
))).
pop
(),
str
(
args
.
max_seq_length
),
str
(
task
)))
str
(
task
),
),
)
label_list
=
processor
.
get_labels
()
...
...
@@ -305,18 +334,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
features
=
torch
.
load
(
cached_features_file
)
else
:
logger
.
info
(
"Creating features from dataset file at %s"
,
args
.
data_dir
)
if
task
in
[
'
mnli
'
,
'
mnli-mm
'
]
and
args
.
model_type
in
[
'
roberta
'
]:
if
task
in
[
"
mnli
"
,
"
mnli-mm
"
]
and
args
.
model_type
in
[
"
roberta
"
]:
# HACK(label indices are swapped in RoBERTa pretrained model)
label_list
[
1
],
label_list
[
2
]
=
label_list
[
2
],
label_list
[
1
]
examples
=
processor
.
get_dev_examples
(
args
.
data_dir
)
if
evaluate
else
processor
.
get_train_examples
(
args
.
data_dir
)
features
=
convert_examples_to_features
(
examples
,
examples
=
(
processor
.
get_dev_examples
(
args
.
data_dir
)
if
evaluate
else
processor
.
get_train_examples
(
args
.
data_dir
)
)
features
=
convert_examples_to_features
(
examples
,
tokenizer
,
label_list
=
label_list
,
max_length
=
args
.
max_seq_length
,
output_mode
=
output_mode
,
pad_on_left
=
bool
(
args
.
model_type
in
[
'
xlnet
'
]),
# pad on the left for xlnet
pad_on_left
=
bool
(
args
.
model_type
in
[
"
xlnet
"
]),
# pad on the left for xlnet
pad_token
=
tokenizer
.
convert_tokens_to_ids
([
tokenizer
.
pad_token
])[
0
],
pad_token_segment_id
=
4
if
args
.
model_type
in
[
'
xlnet
'
]
else
0
,
pad_token_segment_id
=
4
if
args
.
model_type
in
[
"
xlnet
"
]
else
0
,
)
if
args
.
local_rank
in
[
-
1
,
0
]:
logger
.
info
(
"Saving features into cached file %s"
,
cached_features_file
)
...
...
@@ -335,7 +367,6 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
all_labels
=
torch
.
tensor
([
f
.
label
for
f
in
features
],
dtype
=
torch
.
float
)
all_pair_ids
=
torch
.
tensor
([
int
(
f
.
pairID
)
for
f
in
features
],
dtype
=
torch
.
long
)
dataset
=
TensorDataset
(
all_input_ids
,
all_attention_mask
,
all_token_type_ids
,
all_labels
,
all_pair_ids
)
return
dataset
,
label_list
...
...
@@ -344,90 +375,149 @@ def main():
parser
=
argparse
.
ArgumentParser
()
## Required parameters
parser
.
add_argument
(
"--data_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The input data dir. Should contain the .tsv files (or other data files) for the task."
)
parser
.
add_argument
(
"--model_type"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Model type selected in the list: "
+
", "
.
join
(
MODEL_CLASSES
.
keys
()))
parser
.
add_argument
(
"--model_name_or_path"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Path to pre-trained model or shortcut name selected in the list: "
+
", "
.
join
(
ALL_MODELS
))
parser
.
add_argument
(
"--task_name"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The name of the task to train selected in the list: "
+
", "
.
join
(
processors
.
keys
()))
parser
.
add_argument
(
"--output_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The output directory where the model predictions and checkpoints will be written."
)
parser
.
add_argument
(
"--data_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The input data dir. Should contain the .tsv files (or other data files) for the task."
,
)
parser
.
add_argument
(
"--model_type"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Model type selected in the list: "
+
", "
.
join
(
MODEL_CLASSES
.
keys
()),
)
parser
.
add_argument
(
"--model_name_or_path"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"Path to pre-trained model or shortcut name selected in the list: "
+
", "
.
join
(
ALL_MODELS
),
)
parser
.
add_argument
(
"--task_name"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The name of the task to train selected in the list: "
+
", "
.
join
(
processors
.
keys
()),
)
parser
.
add_argument
(
"--output_dir"
,
default
=
None
,
type
=
str
,
required
=
True
,
help
=
"The output directory where the model predictions and checkpoints will be written."
,
)
## Other parameters
parser
.
add_argument
(
"--config_name"
,
default
=
""
,
type
=
str
,
help
=
"Pretrained config name or path if not the same as model_name"
)
parser
.
add_argument
(
"--tokenizer_name"
,
default
=
""
,
type
=
str
,
help
=
"Pretrained tokenizer name or path if not the same as model_name"
)
parser
.
add_argument
(
"--cache_dir"
,
default
=
""
,
type
=
str
,
help
=
"Where do you want to store the pre-trained models downloaded from s3"
)
parser
.
add_argument
(
"--max_seq_length"
,
default
=
128
,
type
=
int
,
parser
.
add_argument
(
"--config_name"
,
default
=
""
,
type
=
str
,
help
=
"Pretrained config name or path if not the same as model_name"
)
parser
.
add_argument
(
"--tokenizer_name"
,
default
=
""
,
type
=
str
,
help
=
"Pretrained tokenizer name or path if not the same as model_name"
,
)
parser
.
add_argument
(
"--cache_dir"
,
default
=
""
,
type
=
str
,
help
=
"Where do you want to store the pre-trained models downloaded from s3"
,
)
parser
.
add_argument
(
"--max_seq_length"
,
default
=
128
,
type
=
int
,
help
=
"The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
)
parser
.
add_argument
(
"--do_train"
,
action
=
'store_true'
,
help
=
"Whether to run training."
)
parser
.
add_argument
(
"--do_eval"
,
action
=
'store_true'
,
help
=
"Whether to run eval on the dev set."
)
parser
.
add_argument
(
"--evaluate_during_training"
,
action
=
'store_true'
,
help
=
"Rul evaluation during training at each logging step."
)
parser
.
add_argument
(
"--do_lower_case"
,
action
=
'store_true'
,
help
=
"Set this flag if you are using an uncased model."
)
parser
.
add_argument
(
"--per_gpu_train_batch_size"
,
default
=
8
,
type
=
int
,
help
=
"Batch size per GPU/CPU for training."
)
parser
.
add_argument
(
"--per_gpu_eval_batch_size"
,
default
=
8
,
type
=
int
,
help
=
"Batch size per GPU/CPU for evaluation."
)
parser
.
add_argument
(
'--gradient_accumulation_steps'
,
type
=
int
,
default
=
1
,
help
=
"Number of updates steps to accumulate before performing a backward/update pass."
)
parser
.
add_argument
(
"--learning_rate"
,
default
=
5e-5
,
type
=
float
,
help
=
"The initial learning rate for Adam."
)
parser
.
add_argument
(
"--weight_decay"
,
default
=
0.0
,
type
=
float
,
help
=
"Weight decay if we apply some."
)
parser
.
add_argument
(
"--adam_epsilon"
,
default
=
1e-8
,
type
=
float
,
help
=
"Epsilon for Adam optimizer."
)
parser
.
add_argument
(
"--max_grad_norm"
,
default
=
1.0
,
type
=
float
,
help
=
"Max gradient norm."
)
parser
.
add_argument
(
"--num_train_epochs"
,
default
=
3.0
,
type
=
float
,
help
=
"Total number of training epochs to perform."
)
parser
.
add_argument
(
"--max_steps"
,
default
=-
1
,
type
=
int
,
help
=
"If > 0: set total number of training steps to perform. Override num_train_epochs."
)
parser
.
add_argument
(
"--warmup_steps"
,
default
=
0
,
type
=
int
,
help
=
"Linear warmup over warmup_steps."
)
parser
.
add_argument
(
'--logging_steps'
,
type
=
int
,
default
=
50
,
help
=
"Log every X updates steps."
)
parser
.
add_argument
(
'--save_steps'
,
type
=
int
,
default
=
50
,
help
=
"Save checkpoint every X updates steps."
)
parser
.
add_argument
(
"--eval_all_checkpoints"
,
action
=
'store_true'
,
help
=
"Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
)
parser
.
add_argument
(
"--no_cuda"
,
action
=
'store_true'
,
help
=
"Avoid using CUDA when available"
)
parser
.
add_argument
(
'--overwrite_output_dir'
,
action
=
'store_true'
,
help
=
"Overwrite the content of the output directory"
)
parser
.
add_argument
(
'--overwrite_cache'
,
action
=
'store_true'
,
help
=
"Overwrite the cached training and evaluation sets"
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
42
,
help
=
"random seed for initialization"
)
parser
.
add_argument
(
'--fp16'
,
action
=
'store_true'
,
help
=
"Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
)
parser
.
add_argument
(
'--fp16_opt_level'
,
type
=
str
,
default
=
'O1'
,
"than this will be truncated, sequences shorter will be padded."
,
)
parser
.
add_argument
(
"--do_train"
,
action
=
"store_true"
,
help
=
"Whether to run training."
)
parser
.
add_argument
(
"--do_eval"
,
action
=
"store_true"
,
help
=
"Whether to run eval on the dev set."
)
parser
.
add_argument
(
"--evaluate_during_training"
,
action
=
"store_true"
,
help
=
"Rul evaluation during training at each logging step."
)
parser
.
add_argument
(
"--do_lower_case"
,
action
=
"store_true"
,
help
=
"Set this flag if you are using an uncased model."
)
parser
.
add_argument
(
"--per_gpu_train_batch_size"
,
default
=
8
,
type
=
int
,
help
=
"Batch size per GPU/CPU for training."
)
parser
.
add_argument
(
"--per_gpu_eval_batch_size"
,
default
=
8
,
type
=
int
,
help
=
"Batch size per GPU/CPU for evaluation."
)
parser
.
add_argument
(
"--gradient_accumulation_steps"
,
type
=
int
,
default
=
1
,
help
=
"Number of updates steps to accumulate before performing a backward/update pass."
,
)
parser
.
add_argument
(
"--learning_rate"
,
default
=
5e-5
,
type
=
float
,
help
=
"The initial learning rate for Adam."
)
parser
.
add_argument
(
"--weight_decay"
,
default
=
0.0
,
type
=
float
,
help
=
"Weight decay if we apply some."
)
parser
.
add_argument
(
"--adam_epsilon"
,
default
=
1e-8
,
type
=
float
,
help
=
"Epsilon for Adam optimizer."
)
parser
.
add_argument
(
"--max_grad_norm"
,
default
=
1.0
,
type
=
float
,
help
=
"Max gradient norm."
)
parser
.
add_argument
(
"--num_train_epochs"
,
default
=
3.0
,
type
=
float
,
help
=
"Total number of training epochs to perform."
)
parser
.
add_argument
(
"--max_steps"
,
default
=-
1
,
type
=
int
,
help
=
"If > 0: set total number of training steps to perform. Override num_train_epochs."
,
)
parser
.
add_argument
(
"--warmup_steps"
,
default
=
0
,
type
=
int
,
help
=
"Linear warmup over warmup_steps."
)
parser
.
add_argument
(
"--logging_steps"
,
type
=
int
,
default
=
50
,
help
=
"Log every X updates steps."
)
parser
.
add_argument
(
"--save_steps"
,
type
=
int
,
default
=
50
,
help
=
"Save checkpoint every X updates steps."
)
parser
.
add_argument
(
"--eval_all_checkpoints"
,
action
=
"store_true"
,
help
=
"Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
,
)
parser
.
add_argument
(
"--no_cuda"
,
action
=
"store_true"
,
help
=
"Avoid using CUDA when available"
)
parser
.
add_argument
(
"--overwrite_output_dir"
,
action
=
"store_true"
,
help
=
"Overwrite the content of the output directory"
)
parser
.
add_argument
(
"--overwrite_cache"
,
action
=
"store_true"
,
help
=
"Overwrite the cached training and evaluation sets"
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
42
,
help
=
"random seed for initialization"
)
parser
.
add_argument
(
"--fp16"
,
action
=
"store_true"
,
help
=
"Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
,
)
parser
.
add_argument
(
"--fp16_opt_level"
,
type
=
str
,
default
=
"O1"
,
help
=
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html"
)
parser
.
add_argument
(
"--local_rank"
,
type
=
int
,
default
=-
1
,
help
=
"For distributed training: local_rank"
)
parser
.
add_argument
(
'
--server_ip
'
,
type
=
str
,
default
=
''
,
help
=
"For distant debugging."
)
parser
.
add_argument
(
'
--server_port
'
,
type
=
str
,
default
=
''
,
help
=
"For distant debugging."
)
"See details at https://nvidia.github.io/apex/amp.html"
,
)
parser
.
add_argument
(
"--local_rank"
,
type
=
int
,
default
=-
1
,
help
=
"For distributed training: local_rank"
)
parser
.
add_argument
(
"
--server_ip
"
,
type
=
str
,
default
=
""
,
help
=
"For distant debugging."
)
parser
.
add_argument
(
"
--server_port
"
,
type
=
str
,
default
=
""
,
help
=
"For distant debugging."
)
args
=
parser
.
parse_args
()
if
os
.
path
.
exists
(
args
.
output_dir
)
and
os
.
listdir
(
args
.
output_dir
)
and
args
.
do_train
and
not
args
.
overwrite_output_dir
:
raise
ValueError
(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
.
format
(
args
.
output_dir
))
if
(
os
.
path
.
exists
(
args
.
output_dir
)
and
os
.
listdir
(
args
.
output_dir
)
and
args
.
do_train
and
not
args
.
overwrite_output_dir
):
raise
ValueError
(
"Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
.
format
(
args
.
output_dir
)
)
# Setup distant debugging if needed
if
args
.
server_ip
and
args
.
server_port
:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import
ptvsd
print
(
"Waiting for debugger attach"
)
ptvsd
.
enable_attach
(
address
=
(
args
.
server_ip
,
args
.
server_port
),
redirect_output
=
True
)
ptvsd
.
wait_for_attach
()
...
...
@@ -439,16 +529,24 @@ def main():
else
:
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch
.
cuda
.
set_device
(
args
.
local_rank
)
device
=
torch
.
device
(
"cuda"
,
args
.
local_rank
)
torch
.
distributed
.
init_process_group
(
backend
=
'
nccl
'
)
torch
.
distributed
.
init_process_group
(
backend
=
"
nccl
"
)
args
.
n_gpu
=
1
args
.
device
=
device
# Setup logging
logging
.
basicConfig
(
format
=
'%(asctime)s - %(levelname)s - %(name)s - %(message)s'
,
datefmt
=
'%m/%d/%Y %H:%M:%S'
,
level
=
logging
.
INFO
if
args
.
local_rank
in
[
-
1
,
0
]
else
logging
.
WARN
)
logger
.
warning
(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s"
,
args
.
local_rank
,
device
,
args
.
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
)
logging
.
basicConfig
(
format
=
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
,
datefmt
=
"%m/%d/%Y %H:%M:%S"
,
level
=
logging
.
INFO
if
args
.
local_rank
in
[
-
1
,
0
]
else
logging
.
WARN
,
)
logger
.
warning
(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s"
,
args
.
local_rank
,
device
,
args
.
n_gpu
,
bool
(
args
.
local_rank
!=
-
1
),
args
.
fp16
,
)
# Set seed
set_seed
(
args
)
...
...
@@ -468,17 +566,23 @@ def main():
args
.
model_type
=
args
.
model_type
.
lower
()
config_class
,
model_class
,
tokenizer_class
=
MODEL_CLASSES
[
args
.
model_type
]
config
=
config_class
.
from_pretrained
(
args
.
config_name
if
args
.
config_name
else
args
.
model_name_or_path
,
config
=
config_class
.
from_pretrained
(
args
.
config_name
if
args
.
config_name
else
args
.
model_name_or_path
,
num_labels
=
num_labels
,
finetuning_task
=
args
.
task_name
,
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
)
tokenizer
=
tokenizer_class
.
from_pretrained
(
args
.
tokenizer_name
if
args
.
tokenizer_name
else
args
.
model_name_or_path
,
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
,
)
tokenizer
=
tokenizer_class
.
from_pretrained
(
args
.
tokenizer_name
if
args
.
tokenizer_name
else
args
.
model_name_or_path
,
do_lower_case
=
args
.
do_lower_case
,
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
)
model
=
model_class
.
from_pretrained
(
args
.
model_name_or_path
,
from_tf
=
bool
(
'.ckpt'
in
args
.
model_name_or_path
),
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
,
)
model
=
model_class
.
from_pretrained
(
args
.
model_name_or_path
,
from_tf
=
bool
(
".ckpt"
in
args
.
model_name_or_path
),
config
=
config
,
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
)
cache_dir
=
args
.
cache_dir
if
args
.
cache_dir
else
None
,
)
if
args
.
local_rank
==
0
:
torch
.
distributed
.
barrier
()
# Make sure only the first process in distributed training will download model & vocab
...
...
@@ -487,14 +591,12 @@ def main():
logger
.
info
(
"Training/evaluation parameters %s"
,
args
)
# Training
if
args
.
do_train
:
train_dataset
,
_
=
load_and_cache_examples
(
args
,
args
.
task_name
,
tokenizer
,
evaluate
=
False
)
global_step
,
tr_loss
=
train
(
args
,
train_dataset
,
model
,
tokenizer
)
logger
.
info
(
" global_step = %s, average loss = %s"
,
global_step
,
tr_loss
)
# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
if
args
.
do_train
and
(
args
.
local_rank
==
-
1
or
torch
.
distributed
.
get_rank
()
==
0
):
# Create output directory if needed
...
...
@@ -504,36 +606,39 @@ def main():
logger
.
info
(
"Saving model checkpoint to %s"
,
args
.
output_dir
)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save
=
model
.
module
if
hasattr
(
model
,
'module'
)
else
model
# Take care of distributed/parallel training
model_to_save
=
(
model
.
module
if
hasattr
(
model
,
"module"
)
else
model
)
# Take care of distributed/parallel training
model_to_save
.
save_pretrained
(
args
.
output_dir
)
tokenizer
.
save_pretrained
(
args
.
output_dir
)
# Good practice: save your training arguments together with the trained model
torch
.
save
(
args
,
os
.
path
.
join
(
args
.
output_dir
,
'
training_args.bin
'
))
torch
.
save
(
args
,
os
.
path
.
join
(
args
.
output_dir
,
"
training_args.bin
"
))
# Load a trained model and vocabulary that you have fine-tuned
model
=
model_class
.
from_pretrained
(
args
.
output_dir
)
tokenizer
=
tokenizer_class
.
from_pretrained
(
args
.
output_dir
)
model
.
to
(
args
.
device
)
# Evaluation
results
=
{}
if
args
.
do_eval
and
args
.
local_rank
in
[
-
1
,
0
]:
tokenizer
=
tokenizer_class
.
from_pretrained
(
args
.
output_dir
,
do_lower_case
=
args
.
do_lower_case
)
checkpoints
=
[
args
.
output_dir
]
if
args
.
eval_all_checkpoints
:
checkpoints
=
list
(
os
.
path
.
dirname
(
c
)
for
c
in
sorted
(
glob
.
glob
(
args
.
output_dir
+
'/**/'
+
WEIGHTS_NAME
,
recursive
=
True
)))
checkpoints
=
list
(
os
.
path
.
dirname
(
c
)
for
c
in
sorted
(
glob
.
glob
(
args
.
output_dir
+
"/**/"
+
WEIGHTS_NAME
,
recursive
=
True
))
)
logging
.
getLogger
(
"transformers.modeling_utils"
).
setLevel
(
logging
.
WARN
)
# Reduce logging
logger
.
info
(
"Evaluate the following checkpoints: %s"
,
checkpoints
)
for
checkpoint
in
checkpoints
:
global_step
=
checkpoint
.
split
(
'-'
)[
-
1
]
if
len
(
checkpoints
)
>
1
else
""
prefix
=
checkpoint
.
split
(
'/'
)[
-
1
]
if
checkpoint
.
find
(
'
checkpoint
'
)
!=
-
1
else
""
global_step
=
checkpoint
.
split
(
"-"
)[
-
1
]
if
len
(
checkpoints
)
>
1
else
""
prefix
=
checkpoint
.
split
(
"/"
)[
-
1
]
if
checkpoint
.
find
(
"
checkpoint
"
)
!=
-
1
else
""
model
=
model_class
.
from_pretrained
(
checkpoint
)
model
.
to
(
args
.
device
)
result
=
evaluate
(
args
,
model
,
tokenizer
,
prefix
=
prefix
)
result
=
dict
((
k
+
'
_{}
'
.
format
(
global_step
),
v
)
for
k
,
v
in
result
.
items
())
result
=
dict
((
k
+
"
_{}
"
.
format
(
global_step
),
v
)
for
k
,
v
in
result
.
items
())
results
.
update
(
result
)
return
results
...
...
examples/hans/utils_hans.py
View file @
1c933358
...
...
@@ -14,10 +14,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
csv
import
sys
import
copy
import
csv
import
json
import
sys
class
InputExample
(
object
):
"""
...
...
@@ -32,6 +33,7 @@ class InputExample(object):
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
def
__init__
(
self
,
guid
,
text_a
,
text_b
=
None
,
label
=
None
,
pairID
=
None
):
self
.
guid
=
guid
self
.
text_a
=
text_a
...
...
@@ -117,6 +119,6 @@ class DataProcessor(object):
lines
=
[]
for
line
in
reader
:
if
sys
.
version_info
[
0
]
==
2
:
line
=
list
(
unicode
(
cell
,
'
utf-8
'
)
for
cell
in
line
)
line
=
list
(
unicode
(
cell
,
"
utf-8
"
)
for
cell
in
line
)
lines
.
append
(
line
)
return
lines
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment