Commit cb8dde1c authored by hepj

Add transformer-xl model code

parent a22e7ca7
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1536 -n 11532 -k 512 --alpha 1 --lda 1536 --ldb 512 --beta 0 --ldc 1536 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 1536 -n 12288 -k 512 --alpha 1 --lda 1536 --ldb 512 --beta 0 --ldc 1536 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 2048 -n 5388 -k 512 --alpha 1 --lda 2048 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 2048 -n 6144 -k 512 --alpha 1 --lda 2048 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 1024 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 5388 -k 2048 --alpha 1 --lda 512 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 5388 -k 204 --alpha 1 --lda 512 --ldb 204 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 5388 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 6144 -k 2048 --alpha 1 --lda 512 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 6144 -k 204 --alpha 1 --lda 512 --ldb 204 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 6144 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 961 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1536 -n 512 -k 11532 --alpha 1 --lda 1536 --ldb 512 --beta 0 --ldc 1536 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 1536 -n 512 -k 12288 --alpha 1 --lda 1536 --ldb 512 --beta 0 --ldc 1536 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 512 -k 5388 --alpha 1 --lda 2048 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 2048 -n 512 -k 6144 --alpha 1 --lda 2048 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 204 -n 512 -k 5388 --alpha 1 --lda 204 --ldb 512 --beta 0 --ldc 204 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 204 -n 512 -k 6144 --alpha 1 --lda 204 --ldb 512 --beta 0 --ldc 204 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 2048 -k 5388 --alpha 1 --lda 512 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 2048 -k 6144 --alpha 1 --lda 512 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 1024 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 5388 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 6144 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 512 -k 961 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 5388 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 2048 -n 6144 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 2048 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 204 -n 5388 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 204 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 204 -n 6144 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 204 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 11532 -k 1536 --alpha 1 --lda 1536 --ldb 1536 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 12288 -k 1536 --alpha 1 --lda 1536 --ldb 1536 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 5388 -k 2048 --alpha 1 --lda 2048 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 5388 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 6144 -k 2048 --alpha 1 --lda 2048 --ldb 2048 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 6144 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 0 --ldc 512 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 1024 -n 512 -k 64 --alpha 1 --lda 1024 --stride_a 65536 --ldb 64 --stride_b 32768 --beta 0 --ldc 1024 --stride_c 524288 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 1024 -n 6144 -k 64 --alpha 1 --lda 1024 --stride_a 65536 --ldb 64 --stride_b 393216 --beta 0 --ldc 1024 --stride_c 6291456 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 64 -n 449 -k 961 --alpha 1 --lda 64 --stride_a 61504 --ldb 961 --stride_b 431489 --beta 0 --ldc 64 --stride_c 28736 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 64 -n 512 -k 1024 --alpha 1 --lda 64 --stride_a 65536 --ldb 1024 --stride_b 524288 --beta 0 --ldc 64 --stride_c 32768 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 961 -n 449 -k 64 --alpha 1 --lda 961 --stride_a 61504 --ldb 64 --stride_b 28736 --beta 0 --ldc 961 --stride_c 431489 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB N -m 961 -n 5388 -k 64 --alpha 1 --lda 961 --stride_a 61504 --ldb 64 --stride_b 344832 --beta 0 --ldc 961 --stride_c 5177868 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1024 -n 64 -k 512 --alpha 1 --lda 1024 --stride_a 524288 --ldb 64 --stride_b 32768 --beta 0 --ldc 1024 --stride_c 65536 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 1024 -n 64 -k 6144 --alpha 1 --lda 1024 --stride_a 6291456 --ldb 64 --stride_b 393216 --beta 0 --ldc 1024 --stride_c 65536 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 1024 -k 512 --alpha 1 --lda 64 --stride_a 32768 --ldb 1024 --stride_b 524288 --beta 0 --ldc 64 --stride_c 65536 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 64 -n 961 -k 449 --alpha 1 --lda 64 --stride_a 28736 --ldb 961 --stride_b 431489 --beta 0 --ldc 64 --stride_c 61504 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 961 -n 64 -k 449 --alpha 1 --lda 961 --stride_a 431489 --ldb 64 --stride_b 28736 --beta 0 --ldc 961 --stride_c 61504 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA N --transposeB T -m 961 -n 64 -k 5388 --alpha 1 --lda 961 --stride_a 5177868 --ldb 64 --stride_b 344832 --beta 0 --ldc 961 --stride_c 61504 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 1024 -n 512 -k 64 --alpha 1 --lda 64 --stride_a 65536 --ldb 64 --stride_b 32768 --beta 0 --ldc 1024 --stride_c 524288 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 64 -n 449 -k 961 --alpha 1 --lda 961 --stride_a 61504 --ldb 961 --stride_b 431489 --beta 0 --ldc 64 --stride_c 28736 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 64 -n 512 -k 1024 --alpha 1 --lda 1024 --stride_a 65536 --ldb 1024 --stride_b 524288 --beta 0 --ldc 64 --stride_c 32768 --batch_count 96 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 64 -n 5388 -k 961 --alpha 1 --lda 961 --stride_a 61504 --ldb 961 --stride_b 5177868 --beta 0 --ldc 64 --stride_c 344832 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 64 -n 6144 -k 1024 --alpha 1 --lda 1024 --stride_a 65536 --ldb 1024 --stride_b 6291456 --beta 0 --ldc 64 --stride_c 393216 --batch_count 8 --atomics_not_allowed
./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 961 -n 449 -k 64 --alpha 1 --lda 64 --stride_a 61504 --ldb 64 --stride_b 28736 --beta 0 --ldc 961 --stride_c 431489 --batch_count 96 --atomics_not_allowed
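The GEMM shapes benchmarked above appear to correspond to the matrix multiplies of the Transformer-XL configuration trained in the log below; the mapping sketched here is an assumption inferred from the shapes and from the log (n_token 204, ~41M parameters), not something stated in the commit.

# Hypothetical mapping of the benchmarked GEMM m/k dims to model layers.
# All hyperparameter values below are assumptions for illustration only.
d_model, n_head, d_head, d_inner, n_token = 512, 8, 64, 2048, 204

candidate_shapes = {
    'qkv projection': (3 * n_head * d_head, d_model),  # 1536 x 512
    'ffn layer_1':    (d_inner, d_model),              # 2048 x 512
    'ffn layer_2':    (d_model, d_inner),               # 512 x 2048
    'softmax logits': (n_token, d_model),               # 204 x 512
}
for name, (m, k) in candidate_shapes.items():
    # The GEMM n dimension would then be batch_size * sequence_length tokens.
    print('{:16s} m={:4d} k={:4d}'.format(name, m, k))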
nohup: ignoring input
Run training...
WARNING:tensorflow:From train_gpu.py:475: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.
WARNING:tensorflow:From train_gpu.py:460: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
W0411 15:09:43.619874 46979636578560 module_wrapper.py:139] From train_gpu.py:460: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
WARNING:tensorflow:From train_gpu.py:460: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
W0411 15:09:43.620179 46979636578560 module_wrapper.py:139] From train_gpu.py:460: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
WARNING:tensorflow:From train_gpu.py:466: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.
W0411 15:09:43.620661 46979636578560 module_wrapper.py:139] From train_gpu.py:466: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.
INFO:tensorflow:n_token 204
I0411 15:09:43.620860 46979636578560 train_gpu.py:466] n_token 204
INFO:tensorflow:[train] File names ['train.bsz-24.tlen-512.tfrecords']
I0411 15:09:43.621257 46979636578560 data_utils.py:430] [train] File names ['train.bsz-24.tlen-512.tfrecords']
INFO:tensorflow:num of batches 7242
I0411 15:09:43.621433 46979636578560 train_gpu.py:234] num of batches 7242
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.sparse_tensor_to_dense is deprecated. Please use tf.sparse.to_dense instead.
W0411 15:09:44.940819 46979636578560 module_wrapper.py:139] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.sparse_tensor_to_dense is deprecated. Please use tf.sparse.to_dense instead.
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.
W0411 15:09:44.942387 46979636578560 module_wrapper.py:139] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.VarLenFeature is deprecated. Please use tf.io.VarLenFeature instead.
W0411 15:09:44.943188 46979636578560 module_wrapper.py:139] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.VarLenFeature is deprecated. Please use tf.io.VarLenFeature instead.
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.parse_single_example is deprecated. Please use tf.io.parse_single_example instead.
W0411 15:09:44.946408 46979636578560 module_wrapper.py:139] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.parse_single_example is deprecated. Please use tf.io.parse_single_example instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/data_utils.py:502: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0411 15:09:45.984954 46979636578560 deprecation.py:323] From /public/home/hepj/SothisAI/transformer-xl-master/tf/data_utils.py:502: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
WARNING:tensorflow:From train_gpu.py:241: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
W0411 15:09:46.000672 46979636578560 deprecation.py:323] From train_gpu.py:241: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
WARNING:tensorflow:From train_gpu.py:253: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.
W0411 15:09:46.023218 46979636578560 module_wrapper.py:139] From train_gpu.py:253: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.
WARNING:tensorflow:From train_gpu.py:253: The name tf.get_variable_scope is deprecated. Please use tf.compat.v1.get_variable_scope instead.
W0411 15:09:46.023566 46979636578560 module_wrapper.py:139] From train_gpu.py:253: The name tf.get_variable_scope is deprecated. Please use tf.compat.v1.get_variable_scope instead.
WARNING:tensorflow:From train_gpu.py:257: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
W0411 15:09:46.023932 46979636578560 module_wrapper.py:139] From train_gpu.py:257: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/gpu_utils.py:6: The name tf.NodeDef is deprecated. Please use tf.compat.v1.NodeDef instead.
W0411 15:09:46.024810 46979636578560 module_wrapper.py:139] From /public/home/hepj/SothisAI/transformer-xl-master/tf/gpu_utils.py:6: The name tf.NodeDef is deprecated. Please use tf.compat.v1.NodeDef instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:460: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
W0411 15:09:46.036548 46979636578560 module_wrapper.py:139] From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:460: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:416: The name tf.matrix_band_part is deprecated. Please use tf.linalg.band_part instead.
W0411 15:09:46.078661 46979636578560 module_wrapper.py:139] From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:416: The name tf.matrix_band_part is deprecated. Please use tf.linalg.band_part instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:493: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
W0411 15:09:46.113715 46979636578560 deprecation.py:323] From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:493: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
W0411 15:09:46.114592 46979636578560 deprecation.py:323] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:54: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
W0411 15:09:46.141592 46979636578560 deprecation.py:323] From /public/home/hepj/SothisAI/transformer-xl-master/tf/model.py:54: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
W0411 15:09:46.438683 46979636578560 lazy_loader.py:50]
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
WARNING:tensorflow:From train_gpu.py:189: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.
W0411 15:09:51.204890 46979636578560 module_wrapper.py:139] From train_gpu.py:189: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.
INFO:tensorflow:#params: 41055436
I0411 15:09:51.216222 46979636578560 train_gpu.py:190] #params: 41055436
INFO:tensorflow:#params: 41055436
I0411 15:10:00.890560 46979636578560 train_gpu.py:190] #params: 41055436
INFO:tensorflow:#params: 41055436
I0411 15:10:10.730956 46979636578560 train_gpu.py:190] #params: 41055436
INFO:tensorflow:#params: 41055436
I0411 15:10:20.506927 46979636578560 train_gpu.py:190] #params: 41055436
WARNING:tensorflow:From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/ops/clip_ops.py:301: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0411 15:10:26.637341 46979636578560 deprecation.py:323] From /public/home/hepj/job_env/venv_1/lib/python3.6/site-packages/tensorflow_core/python/ops/clip_ops.py:301: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From train_gpu.py:286: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
W0411 15:10:26.832451 46979636578560 module_wrapper.py:139] From train_gpu.py:286: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
WARNING:tensorflow:From train_gpu.py:296: The name tf.train.cosine_decay is deprecated. Please use tf.compat.v1.train.cosine_decay instead.
W0411 15:10:26.837068 46979636578560 module_wrapper.py:139] From train_gpu.py:296: The name tf.train.cosine_decay is deprecated. Please use tf.compat.v1.train.cosine_decay instead.
WARNING:tensorflow:From train_gpu.py:307: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
W0411 15:10:26.851632 46979636578560 module_wrapper.py:139] From train_gpu.py:307: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
WARNING:tensorflow:From train_gpu.py:317: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.
W0411 15:10:28.594886 46979636578560 module_wrapper.py:139] From train_gpu.py:317: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.
WARNING:tensorflow:From train_gpu.py:319: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.
W0411 15:10:29.020682 46979636578560 module_wrapper.py:139] From train_gpu.py:319: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.
WARNING:tensorflow:From train_gpu.py:319: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.
W0411 15:10:29.021113 46979636578560 module_wrapper.py:139] From train_gpu.py:319: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.
2022-04-11 15:10:29.035479: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 1999885000 Hz
2022-04-11 15:10:29.037594: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x13b302a0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-04-11 15:10:29.037722: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2022-04-11 15:10:29.041016: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libamdhip64.so
2022-04-11 15:10:33.083881: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1d6c54d0 initialized for platform ROCM (this does not guarantee that XLA will be used). Devices:
2022-04-11 15:10:33.084019: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Device 66a1, AMDGPU ISA version: gfx906
2022-04-11 15:10:33.084060: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (1): Device 66a1, AMDGPU ISA version: gfx906
2022-04-11 15:10:33.084096: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (2): Device 66a1, AMDGPU ISA version: gfx906
2022-04-11 15:10:33.084142: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (3): Device 66a1, AMDGPU ISA version: gfx906
2022-04-11 15:10:33.092406: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 0 with properties:
name: Device 66a1
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.7
pciBusID 0000:04:00.0
2022-04-11 15:10:33.092565: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 1 with properties:
name: Device 66a1
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.7
pciBusID 0000:26:00.0
2022-04-11 15:10:33.092653: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 2 with properties:
name: Device 66a1
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.7
pciBusID 0000:43:00.0
2022-04-11 15:10:33.092738: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 3 with properties:
name: Device 66a1
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.7
pciBusID 0000:63:00.0
2022-04-11 15:10:35.914914: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocblas.so
2022-04-11 15:10:35.925268: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libMIOpen.so
2022-04-11 15:11:04.234699: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocfft.so
2022-04-11 15:11:04.353950: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocrand.so
2022-04-11 15:11:04.354631: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0, 1, 2, 3
2022-04-11 15:11:04.354846: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-04-11 15:11:04.354919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186] 0 1 2 3
2022-04-11 15:11:04.355005: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0: N Y Y Y
2022-04-11 15:11:04.355062: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 1: Y N Y Y
2022-04-11 15:11:04.355110: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 2: Y Y N Y
2022-04-11 15:11:04.355160: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 3: Y Y Y N
2022-04-11 15:11:04.355927: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14923 MB memory) -> physical GPU (device: 0, name: Device 66a1, pci bus id: 0000:04:00.0)
2022-04-11 15:11:04.358874: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14922 MB memory) -> physical GPU (device: 1, name: Device 66a1, pci bus id: 0000:26:00.0)
2022-04-11 15:11:04.364894: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 14923 MB memory) -> physical GPU (device: 2, name: Device 66a1, pci bus id: 0000:43:00.0)
2022-04-11 15:11:04.374173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 14923 MB memory) -> physical GPU (device: 3, name: Device 66a1, pci bus id: 0000:63:00.0)
WARNING:tensorflow:From train_gpu.py:320: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.
W0411 15:11:04.391065 46979636578560 module_wrapper.py:139] From train_gpu.py:320: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.
2022-04-11 15:11:07.726957: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-04-11 15:11:32.594987: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-04-11 15:11:32.984313: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocblas.so
2022-04-11 15:11:34.189588: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-04-11 15:11:34.193113: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
INFO:tensorflow:[200] | gnorm 0.56 lr 0.000250 | loss 2.55 | pplx 12.86, bpc 3.6843
I0411 15:13:59.103161 46979636578560 train_gpu.py:345] [200] | gnorm 0.56 lr 0.000250 | loss 2.55 | pplx 12.86, bpc 3.6843
INFO:tensorflow:[400] | gnorm 0.46 lr 0.000249 | loss 1.74 | pplx 5.67, bpc 2.5034
I0411 15:16:13.616544 46979636578560 train_gpu.py:345] [400] | gnorm 0.46 lr 0.000249 | loss 1.74 | pplx 5.67, bpc 2.5034
INFO:tensorflow:[600] | gnorm 0.46 lr 0.000248 | loss 1.58 | pplx 4.84, bpc 2.2762
I0411 15:18:28.268989 46979636578560 train_gpu.py:345] [600] | gnorm 0.46 lr 0.000248 | loss 1.58 | pplx 4.84, bpc 2.2762
INFO:tensorflow:[800] | gnorm 0.44 lr 0.000246 | loss 1.41 | pplx 4.11, bpc 2.0408
I0411 15:20:42.953228 46979636578560 train_gpu.py:345] [800] | gnorm 0.44 lr 0.000246 | loss 1.41 | pplx 4.11, bpc 2.0408
INFO:tensorflow:[1000] | gnorm 0.42 lr 0.000244 | loss 1.31 | pplx 3.71, bpc 1.8905
I0411 15:22:57.681006 46979636578560 train_gpu.py:345] [1000] | gnorm 0.42 lr 0.000244 | loss 1.31 | pplx 3.71, bpc 1.8905
INFO:tensorflow:[1200] | gnorm 0.41 lr 0.000241 | loss 1.24 | pplx 3.47, bpc 1.7957
I0411 15:25:12.388444 46979636578560 train_gpu.py:345] [1200] | gnorm 0.41 lr 0.000241 | loss 1.24 | pplx 3.47, bpc 1.7957
INFO:tensorflow:[1400] | gnorm 0.41 lr 0.000238 | loss 1.20 | pplx 3.32, bpc 1.7314
I0411 15:27:27.104644 46979636578560 train_gpu.py:345] [1400] | gnorm 0.41 lr 0.000238 | loss 1.20 | pplx 3.32, bpc 1.7314
INFO:tensorflow:[1600] | gnorm 0.38 lr 0.000235 | loss 1.18 | pplx 3.27, bpc 1.7074
I0411 15:29:42.007892 46979636578560 train_gpu.py:345] [1600] | gnorm 0.38 lr 0.000235 | loss 1.18 | pplx 3.27, bpc 1.7074
INFO:tensorflow:[1800] | gnorm 0.37 lr 0.000231 | loss 1.15 | pplx 3.17, bpc 1.6638
I0411 15:31:56.785848 46979636578560 train_gpu.py:345] [1800] | gnorm 0.37 lr 0.000231 | loss 1.15 | pplx 3.17, bpc 1.6638
INFO:tensorflow:[2000] | gnorm 0.35 lr 0.000226 | loss 1.17 | pplx 3.22, bpc 1.6877
I0411 15:34:11.768712 46979636578560 train_gpu.py:345] [2000] | gnorm 0.35 lr 0.000226 | loss 1.17 | pplx 3.22, bpc 1.6877
INFO:tensorflow:[2200] | gnorm 0.37 lr 0.000221 | loss 1.13 | pplx 3.11, bpc 1.6366
I0411 15:36:26.584105 46979636578560 train_gpu.py:345] [2200] | gnorm 0.37 lr 0.000221 | loss 1.13 | pplx 3.11, bpc 1.6366
INFO:tensorflow:[2400] | gnorm 0.35 lr 0.000216 | loss 1.12 | pplx 3.06, bpc 1.6146
I0411 15:38:41.714276 46979636578560 train_gpu.py:345] [2400] | gnorm 0.35 lr 0.000216 | loss 1.12 | pplx 3.06, bpc 1.6146
INFO:tensorflow:[2600] | gnorm 0.39 lr 0.000211 | loss 1.12 | pplx 3.07, bpc 1.6197
I0411 15:40:56.903222 46979636578560 train_gpu.py:345] [2600] | gnorm 0.39 lr 0.000211 | loss 1.12 | pplx 3.07, bpc 1.6197
INFO:tensorflow:[2800] | gnorm 0.38 lr 0.000205 | loss 1.10 | pplx 2.99, bpc 1.5815
I0411 15:43:12.326100 46979636578560 train_gpu.py:345] [2800] | gnorm 0.38 lr 0.000205 | loss 1.10 | pplx 2.99, bpc 1.5815
INFO:tensorflow:[3000] | gnorm 0.35 lr 0.000199 | loss 1.09 | pplx 2.97, bpc 1.5693
I0411 15:45:27.849482 46979636578560 train_gpu.py:345] [3000] | gnorm 0.35 lr 0.000199 | loss 1.09 | pplx 2.97, bpc 1.5693
INFO:tensorflow:[3200] | gnorm 0.34 lr 0.000192 | loss 1.08 | pplx 2.96, bpc 1.5641
I0411 15:47:43.269138 46979636578560 train_gpu.py:345] [3200] | gnorm 0.34 lr 0.000192 | loss 1.08 | pplx 2.96, bpc 1.5641
INFO:tensorflow:[3400] | gnorm 0.36 lr 0.000185 | loss 1.06 | pplx 2.90, bpc 1.5352
I0411 15:49:58.617992 46979636578560 train_gpu.py:345] [3400] | gnorm 0.36 lr 0.000185 | loss 1.06 | pplx 2.90, bpc 1.5352
INFO:tensorflow:[3600] | gnorm 0.35 lr 0.000179 | loss 1.07 | pplx 2.90, bpc 1.5370
I0411 15:52:14.135429 46979636578560 train_gpu.py:345] [3600] | gnorm 0.35 lr 0.000179 | loss 1.07 | pplx 2.90, bpc 1.5370
INFO:tensorflow:[3800] | gnorm 0.33 lr 0.000171 | loss 1.05 | pplx 2.86, bpc 1.5170
I0411 15:54:29.591644 46979636578560 train_gpu.py:345] [3800] | gnorm 0.33 lr 0.000171 | loss 1.05 | pplx 2.86, bpc 1.5170
INFO:tensorflow:[4000] | gnorm 0.33 lr 0.000164 | loss 1.06 | pplx 2.89, bpc 1.5298
I0411 15:56:44.983796 46979636578560 train_gpu.py:345] [4000] | gnorm 0.33 lr 0.000164 | loss 1.06 | pplx 2.89, bpc 1.5298
2022-04-11 15:56:46.113523: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
INFO:tensorflow:Model saved in path: EXP-enwik8/model.ckpt
I0411 15:56:51.852108 46979636578560 train_gpu.py:351] Model saved in path: EXP-enwik8/model.ckpt
INFO:tensorflow:[4200] | gnorm 0.33 lr 0.000156 | loss 1.06 | pplx 2.88, bpc 1.5241
I0411 15:59:07.272919 46979636578560 train_gpu.py:345] [4200] | gnorm 0.33 lr 0.000156 | loss 1.06 | pplx 2.88, bpc 1.5241
INFO:tensorflow:[4400] | gnorm 0.33 lr 0.000149 | loss 1.03 | pplx 2.81, bpc 1.4889
I0411 16:01:22.739627 46979636578560 train_gpu.py:345] [4400] | gnorm 0.33 lr 0.000149 | loss 1.03 | pplx 2.81, bpc 1.4889
INFO:tensorflow:[4600] | gnorm 0.34 lr 0.000141 | loss 1.05 | pplx 2.87, bpc 1.5207
I0411 16:03:38.228260 46979636578560 train_gpu.py:345] [4600] | gnorm 0.34 lr 0.000141 | loss 1.05 | pplx 2.87, bpc 1.5207
INFO:tensorflow:[4800] | gnorm 0.32 lr 0.000133 | loss 1.02 | pplx 2.78, bpc 1.4776
I0411 16:05:53.647228 46979636578560 train_gpu.py:345] [4800] | gnorm 0.32 lr 0.000133 | loss 1.02 | pplx 2.78, bpc 1.4776
INFO:tensorflow:[5000] | gnorm 0.32 lr 0.000126 | loss 1.02 | pplx 2.78, bpc 1.4747
I0411 16:08:09.163792 46979636578560 train_gpu.py:345] [5000] | gnorm 0.32 lr 0.000126 | loss 1.02 | pplx 2.78, bpc 1.4747
INFO:tensorflow:[5200] | gnorm 0.32 lr 0.000118 | loss 1.04 | pplx 2.83, bpc 1.4988
I0411 16:10:24.546232 46979636578560 train_gpu.py:345] [5200] | gnorm 0.32 lr 0.000118 | loss 1.04 | pplx 2.83, bpc 1.4988
INFO:tensorflow:[5400] | gnorm 0.34 lr 0.000110 | loss 1.04 | pplx 2.82, bpc 1.4976
I0411 16:12:39.994851 46979636578560 train_gpu.py:345] [5400] | gnorm 0.34 lr 0.000110 | loss 1.04 | pplx 2.82, bpc 1.4976
INFO:tensorflow:[5600] | gnorm 0.33 lr 0.000102 | loss 1.03 | pplx 2.79, bpc 1.4825
I0411 16:14:55.336359 46979636578560 train_gpu.py:345] [5600] | gnorm 0.33 lr 0.000102 | loss 1.03 | pplx 2.79, bpc 1.4825
INFO:tensorflow:[5800] | gnorm 0.34 lr 0.000095 | loss 1.00 | pplx 2.71, bpc 1.4406
I0411 16:17:10.817455 46979636578560 train_gpu.py:345] [5800] | gnorm 0.34 lr 0.000095 | loss 1.00 | pplx 2.71, bpc 1.4406
INFO:tensorflow:[6000] | gnorm 0.34 lr 0.000087 | loss 1.00 | pplx 2.72, bpc 1.4411
I0411 16:19:26.204326 46979636578560 train_gpu.py:345] [6000] | gnorm 0.34 lr 0.000087 | loss 1.00 | pplx 2.72, bpc 1.4411
INFO:tensorflow:[6200] | gnorm 0.33 lr 0.000080 | loss 1.02 | pplx 2.77, bpc 1.4704
I0411 16:21:41.674479 46979636578560 train_gpu.py:345] [6200] | gnorm 0.33 lr 0.000080 | loss 1.02 | pplx 2.77, bpc 1.4704
INFO:tensorflow:[6400] | gnorm 0.32 lr 0.000072 | loss 1.01 | pplx 2.74, bpc 1.4556
I0411 16:23:57.228107 46979636578560 train_gpu.py:345] [6400] | gnorm 0.32 lr 0.000072 | loss 1.01 | pplx 2.74, bpc 1.4556
INFO:tensorflow:[6600] | gnorm 0.35 lr 0.000066 | loss 1.03 | pplx 2.80, bpc 1.4847
I0411 16:26:12.733853 46979636578560 train_gpu.py:345] [6600] | gnorm 0.35 lr 0.000066 | loss 1.03 | pplx 2.80, bpc 1.4847
INFO:tensorflow:[6800] | gnorm 0.32 lr 0.000059 | loss 1.02 | pplx 2.78, bpc 1.4754
I0411 16:28:28.142014 46979636578560 train_gpu.py:345] [6800] | gnorm 0.32 lr 0.000059 | loss 1.02 | pplx 2.78, bpc 1.4754
INFO:tensorflow:[7000] | gnorm 0.35 lr 0.000052 | loss 0.99 | pplx 2.70, bpc 1.4311
I0411 16:30:43.719517 46979636578560 train_gpu.py:345] [7000] | gnorm 0.35 lr 0.000052 | loss 0.99 | pplx 2.70, bpc 1.4311
INFO:tensorflow:[7200] | gnorm 0.33 lr 0.000046 | loss 0.97 | pplx 2.64, bpc 1.4001
I0411 16:32:59.260841 46979636578560 train_gpu.py:345] [7200] | gnorm 0.33 lr 0.000046 | loss 0.97 | pplx 2.64, bpc 1.4001
INFO:tensorflow:[7400] | gnorm 0.34 lr 0.000040 | loss 1.01 | pplx 2.73, bpc 1.4512
I0411 16:35:14.672811 46979636578560 train_gpu.py:345] [7400] | gnorm 0.34 lr 0.000040 | loss 1.01 | pplx 2.73, bpc 1.4512
INFO:tensorflow:[7600] | gnorm 0.33 lr 0.000035 | loss 0.96 | pplx 2.61, bpc 1.3820
I0411 16:37:30.178255 46979636578560 train_gpu.py:345] [7600] | gnorm 0.33 lr 0.000035 | loss 0.96 | pplx 2.61, bpc 1.3820
INFO:tensorflow:[7800] | gnorm 0.33 lr 0.000030 | loss 0.98 | pplx 2.68, bpc 1.4198
I0411 16:39:45.574867 46979636578560 train_gpu.py:345] [7800] | gnorm 0.33 lr 0.000030 | loss 0.98 | pplx 2.68, bpc 1.4198
INFO:tensorflow:[8000] | gnorm 0.31 lr 0.000025 | loss 0.96 | pplx 2.61, bpc 1.3850
I0411 16:42:00.981717 46979636578560 train_gpu.py:345] [8000] | gnorm 0.31 lr 0.000025 | loss 0.96 | pplx 2.61, bpc 1.3850
INFO:tensorflow:Model saved in path: EXP-enwik8/model.ckpt
I0411 16:42:05.911054 46979636578560 train_gpu.py:351] Model saved in path: EXP-enwik8/model.ckpt
INFO:tensorflow:[8200] | gnorm 0.31 lr 0.000020 | loss 0.98 | pplx 2.65, bpc 1.4076
I0411 16:44:21.411516 46979636578560 train_gpu.py:345] [8200] | gnorm 0.31 lr 0.000020 | loss 0.98 | pplx 2.65, bpc 1.4076
INFO:tensorflow:[8400] | gnorm 0.30 lr 0.000016 | loss 0.97 | pplx 2.63, bpc 1.3931
I0411 16:46:36.940109 46979636578560 train_gpu.py:345] [8400] | gnorm 0.30 lr 0.000016 | loss 0.97 | pplx 2.63, bpc 1.3931
INFO:tensorflow:[8600] | gnorm 0.32 lr 0.000013 | loss 0.95 | pplx 2.59, bpc 1.3718
I0411 16:48:52.560289 46979636578560 train_gpu.py:345] [8600] | gnorm 0.32 lr 0.000013 | loss 0.95 | pplx 2.59, bpc 1.3718
INFO:tensorflow:[8800] | gnorm 0.33 lr 0.000010 | loss 0.95 | pplx 2.58, bpc 1.3666
I0411 16:51:08.070581 46979636578560 train_gpu.py:345] [8800] | gnorm 0.33 lr 0.000010 | loss 0.95 | pplx 2.58, bpc 1.3666
INFO:tensorflow:[9000] | gnorm 0.31 lr 0.000007 | loss 0.96 | pplx 2.62, bpc 1.3921
I0411 16:53:23.609839 46979636578560 train_gpu.py:345] [9000] | gnorm 0.31 lr 0.000007 | loss 0.96 | pplx 2.62, bpc 1.3921
INFO:tensorflow:[9200] | gnorm 0.31 lr 0.000005 | loss 0.98 | pplx 2.66, bpc 1.4139
I0411 16:55:38.975215 46979636578560 train_gpu.py:345] [9200] | gnorm 0.31 lr 0.000005 | loss 0.98 | pplx 2.66, bpc 1.4139
INFO:tensorflow:[9400] | gnorm 0.32 lr 0.000003 | loss 0.97 | pplx 2.65, bpc 1.4033
I0411 16:57:54.425925 46979636578560 train_gpu.py:345] [9400] | gnorm 0.32 lr 0.000003 | loss 0.97 | pplx 2.65, bpc 1.4033
INFO:tensorflow:[9600] | gnorm 0.34 lr 0.000002 | loss 0.97 | pplx 2.63, bpc 1.3954
I0411 17:00:09.927987 46979636578560 train_gpu.py:345] [9600] | gnorm 0.34 lr 0.000002 | loss 0.97 | pplx 2.63, bpc 1.3954
INFO:tensorflow:[9800] | gnorm 0.31 lr 0.000001 | loss 0.97 | pplx 2.65, bpc 1.4051
I0411 17:02:25.449383 46979636578560 train_gpu.py:345] [9800] | gnorm 0.31 lr 0.000001 | loss 0.97 | pplx 2.65, bpc 1.4051
INFO:tensorflow:[10000] | gnorm 0.33 lr 0.000001 | loss 0.96 | pplx 2.62, bpc 1.3900
I0411 17:04:40.909836 46979636578560 train_gpu.py:345] [10000] | gnorm 0.33 lr 0.000001 | loss 0.96 | pplx 2.62, bpc 1.3900
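For reference when reading the log above, the pplx and bpc columns appear to be simple transforms of the reported loss (perplexity = exp(loss), bits per character = loss / ln 2); the step-200 line checks out:

import math

# Sanity check against the step-200 log line:
#   "loss 2.55 | pplx 12.86, bpc 3.6843"
# loss is printed with only two decimals, so recover it from bpc instead.
loss = 3.6843 * math.log(2)            # ~2.5538
print(round(math.exp(loss), 2))        # 12.86  -> matches pplx
print(round(loss / math.log(2), 4))    # 3.6843 -> matches bpc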
import tensorflow as tf
def positional_embedding(pos_seq, inv_freq, bsz=None):
  sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
  pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
  if bsz is not None:
    return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
  else:
    return pos_emb[:, None, :]

def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer,
                    scope='ff', is_training=True):
  output = inp
  with tf.variable_scope(scope):
    output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu,
                             kernel_initializer=kernel_initializer,
                             name='layer_1')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_1')
    output = tf.layers.dense(output, d_model,
                             kernel_initializer=kernel_initializer,
                             name='layer_2')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_2')
    output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1)
  return output

def rel_shift(x):
  # Relative-shift trick: pad, reshape and slice so that entry [i, j]
  # lines up with the relative distance between query i and key j.
  x_size = tf.shape(x)

  x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
  x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  x = tf.reshape(x, x_size)

  return x

def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
  scale = 1 / (d_head ** 0.5)
  with tf.variable_scope(scope):
    qlen = tf.shape(w)[0]
    rlen = tf.shape(r)[0]
    bsz = tf.shape(w)[1]

    cat = tf.concat([mems, w],
                    0) if mems is not None and mems.shape.ndims > 1 else w
    w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
                              kernel_initializer=kernel_initializer, name='qkv')
    r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
                               kernel_initializer=kernel_initializer, name='r')

    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
    w_head_q = w_head_q[-qlen:]

    klen = tf.shape(w_head_k)[0]

    w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
    w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
    w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])

    r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])

    rw_head_q = w_head_q + r_w_bias
    rr_head_q = w_head_q + r_r_bias

    # AC: content-based term (query plus content bias r_w_bias);
    # BD: relative-position term (query plus position bias r_r_bias),
    # realigned by rel_shift.
    AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)
    BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)
    BD = rel_shift(BD)

    attn_score = (AC + BD) * scale
    # attn_mask is 1 at disallowed (future) positions: zero those scores
    # and push them to a large negative value before the softmax.
    attn_mask_t = attn_mask[:, :, None, None]
    attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t

    attn_prob = tf.nn.softmax(attn_score, 1)
    attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)

    attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
    size_t = tf.shape(attn_vec)
    attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])

    attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
                               kernel_initializer=kernel_initializer, name='o')
    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)

    output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
  return output

def embedding_lookup(lookup_table, x, use_tpu=True):
  if use_tpu:
    n_token = tf.shape(lookup_table)[0]
    one_hot_idx = tf.one_hot(x, n_token)
    if one_hot_idx.shape.ndims == 2:
      return tf.einsum('nd,in->id', lookup_table, one_hot_idx)
    else:
      return tf.einsum('nd,ibn->ibd', lookup_table, one_hot_idx)
  else:
    return tf.nn.embedding_lookup(lookup_table, x)

def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                   proj_initializer, div_val=1,
                                   proj_same_dim=True,
                                   scope='adaptive_embed', **kwargs):
  emb_scale = d_proj ** 0.5
  with tf.variable_scope(scope):
    if div_val == 1:
      lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
                                     initializer=initializer)
      y = embedding_lookup(lookup_table, x, use_tpu=False)
      if d_proj != d_embed:
        proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
                                 initializer=proj_initializer)
        y = tf.einsum('ibe,ed->ibd', y, proj_W)
      else:
        proj_W = None
      ret_params = [lookup_table, proj_W]
    else:
      tables, projs = [], []
      cutoff_ends = [0] + cutoffs + [n_token]
      x_size = tf.shape(x)
      y = tf.zeros([x_size[0], x_size[1], d_proj])
      for i in range(len(cutoff_ends) - 1):
        with tf.variable_scope('cutoff_{}'.format(i)):
          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
          mask = (x >= l_idx) & (x < r_idx)
          cur_x = tf.boolean_mask(x, mask) - l_idx
          cur_d_embed = d_embed // (div_val ** i)
          lookup_table = tf.get_variable('lookup_table',
                                         [r_idx - l_idx, cur_d_embed],
                                         initializer=initializer)
          cur_y = embedding_lookup(lookup_table, cur_x, use_tpu=False)
          if d_proj == cur_d_embed and not proj_same_dim:
            proj_W = None
          else:
            proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
                                     initializer=proj_initializer)
            cur_y = tf.einsum('id,de->ie', cur_y, proj_W)
          mask_idx = tf.to_int64(tf.where(mask))
          y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))
          tables.append(lookup_table)
          projs.append(proj_W)
      ret_params = [tables, projs]

  y *= emb_scale
  return y, ret_params

def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                  proj_initializer, div_val=1, perms=None,
                                  proj_same_dim=True,
                                  scope='adaptive_embed'):
  """
  perms: If None, first compute W = W1 x W2 (projection for each bin),
      and then compute X x W (embedding lookup). If not None,
      use bin-based embedding lookup with max_bin_size defined by
      the shape of perms.
  """
  emb_scale = d_proj ** 0.5
  with tf.variable_scope(scope):
    if div_val == 1:
      lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
                                     initializer=initializer)
      y = embedding_lookup(lookup_table, x)
      if d_proj != d_embed:
        proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
                                 initializer=proj_initializer)
        y = tf.einsum('ibe,ed->ibd', y, proj_W)
      else:
        proj_W = None
      ret_params = [lookup_table, proj_W]
    else:
      tables, projs = [], []
      cutoff_ends = [0] + cutoffs + [n_token]
      x_size = tf.shape(x)
      if perms is None:
        cat_lookup = []
      else:
        cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])
      for i in range(len(cutoff_ends) - 1):
        with tf.variable_scope('cutoff_{}'.format(i)):
          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
          cur_d_embed = d_embed // (div_val ** i)
          lookup_table = tf.get_variable('lookup_table',
                                         [r_idx - l_idx, cur_d_embed],
                                         initializer=initializer)
          if cur_d_embed == d_proj and not proj_same_dim:
            proj_W = None
          else:
            proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
                                     initializer=proj_initializer)
          if perms is None:
            cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))
          else:
            # speed up the computation of the first bin
            # also save some memory
            if i == 0:
              cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))
              if proj_W is not None:
                cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)
              cur_y *= perms[i][:, :, None]
              cat_lookup += cur_y
            else:
              cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])
              cur_x = tf.to_int32(cur_x)
              cur_y = embedding_lookup(lookup_table, cur_x)
              if proj_W is not None:
                cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)
              cat_lookup += tf.einsum('kd,ibk->ibd', cur_y, perms[i])
          tables.append(lookup_table)
          projs.append(proj_W)
      if perms is None:
        cat_lookup = tf.concat(cat_lookup, 0)
        y = embedding_lookup(cat_lookup, x)
      else:
        y = cat_lookup
      ret_params = [tables, projs]

  y *= emb_scale
  return y, ret_params

def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                             params, tie_projs,
                             initializer=None, proj_initializer=None,
                             div_val=1, scope='adaptive_softmax',
                             proj_same_dim=True,
                             return_mean=True, **kwargs):
  def _logit(x, W, b, proj):
    y = x
    if proj is not None:
      y = tf.einsum('ibd,ed->ibe', y, proj)
    return tf.einsum('ibd,nd->ibn', y, W) + b

  params_W, params_projs = params[0], params[1]

  def _gather_logprob(logprob, target):
    lp_size = tf.shape(logprob)
    r = tf.range(lp_size[0])
    idx = tf.stack([r, target], 1)
    return tf.gather_nd(logprob, idx)

  with tf.variable_scope(scope):
    if len(cutoffs) == 0:
      softmax_b = tf.get_variable('bias', [n_token],
                                  initializer=tf.zeros_initializer())
      output = _logit(hidden, params_W, softmax_b, params_projs)
      nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                           logits=output)
    else:
      cutoff_ends = [0] + cutoffs + [n_token]
      nll = tf.zeros_like(target, dtype=tf.float32)
      for i in range(len(cutoff_ends) - 1):
        with tf.variable_scope('cutoff_{}'.format(i)):
          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
          mask = (target >= l_idx) & (target < r_idx)
          mask_idx = tf.where(mask)
          cur_target = tf.boolean_mask(target, mask) - l_idx
          cur_d_embed = d_embed // (div_val ** i)

          if div_val == 1:
            cur_W = params_W[l_idx: r_idx]
          else:
            cur_W = params_W[i]
          cur_b = tf.get_variable('b', [r_idx - l_idx],
                                  initializer=tf.zeros_initializer())
          if tie_projs[i]:
            if div_val == 1:
              cur_proj = params_projs
            else:
              cur_proj = params_projs[i]
          else:
            if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
              cur_proj = None
            else:
              cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
                                         initializer=proj_initializer)
          if i == 0:
            cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
                                        initializer=tf.zeros_initializer())
            cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
                                        initializer=tf.zeros_initializer())
            cur_W = tf.concat([cur_W, cluster_W], 0)
            cur_b = tf.concat([cur_b, cluster_b], 0)

            head_logit = _logit(hidden, cur_W, cur_b, cur_proj)
            head_logprob = tf.nn.log_softmax(head_logit)
            cur_head_logprob = tf.boolean_mask(head_logprob, mask)
            cur_logprob = _gather_logprob(cur_head_logprob, cur_target)
          else:
            cur_head_logprob = tf.boolean_mask(head_logprob, mask)
            cur_hidden = tf.boolean_mask(hidden, mask)
            tail_logit = tf.squeeze(_logit(
                cur_hidden[None], cur_W, cur_b, cur_proj), 0)
            tail_logprob = tf.nn.log_softmax(tail_logit)
            cur_logprob = (cur_head_logprob[:, cutoff_ends[1] + i - 1] +
                           _gather_logprob(tail_logprob, cur_target))
          nll += tf.scatter_nd(mask_idx, -cur_logprob,
                               tf.to_int64(tf.shape(nll)))
  if return_mean:
    nll = tf.reduce_mean(nll)
  return nll

def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                            params, tie_projs,
                            initializer=None, proj_initializer=None,
                            div_val=1, perms=None, proj_same_dim=True,
                            scope='adaptive_softmax',
                            **kwargs):
  def _logit(x, W, b, proj):
    y = x
    if x.shape.ndims == 3:
      if proj is not None:
        y = tf.einsum('ibd,ed->ibe', y, proj)
      return tf.einsum('ibd,nd->ibn', y, W) + b
    else:
      if proj is not None:
        y = tf.einsum('id,ed->ie', y, proj)
      return tf.einsum('id,nd->in', y, W) + b

  params_W, params_projs = params[0], params[1]

  with tf.variable_scope(scope):
    if len(cutoffs) == 0:
      softmax_b = tf.get_variable('bias', [n_token],
                                  initializer=tf.zeros_initializer())
      output = _logit(hidden, params_W, softmax_b, params_projs)
      nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                           logits=output)
      nll = tf.reduce_mean(nll)
    else:
      total_loss, total_cnt = 0, 0
      cutoff_ends = [0] + cutoffs + [n_token]
      for i in range(len(cutoff_ends) - 1):
        with tf.variable_scope('cutoff_{}'.format(i)):
          l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
          cur_d_embed = d_embed // (div_val ** i)

          if div_val == 1:
            cur_W = params_W[l_idx: r_idx]
          else:
            cur_W = params_W[i]
          cur_b = tf.get_variable('b', [r_idx - l_idx],
                                  initializer=tf.zeros_initializer())
          if tie_projs[i]:
            if div_val == 1:
              cur_proj = params_projs
            else:
              cur_proj = params_projs[i]
          else:
            if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
              cur_proj = None
            else:
              cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
                                         initializer=proj_initializer)
          if i == 0:
            cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
                                        initializer=tf.zeros_initializer())
            cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
                                        initializer=tf.zeros_initializer())
            cur_W = tf.concat([cur_W, cluster_W], 0)
            cur_b = tf.concat([cur_b, cluster_b], 0)

            head_logit = _logit(hidden, cur_W, cur_b, cur_proj)

            head_target = kwargs.get("head_target")
            head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=head_target,
                logits=head_logit)

            masked_loss = head_nll * perms[i]
            total_loss += tf.reduce_sum(masked_loss)
            total_cnt += tf.reduce_sum(perms[i])

            # head_logprob = tf.nn.log_softmax(head_logit)
            # final_logprob = head_logprob * perms[i][:, :, None]
            # final_target = tf.one_hot(target, tf.shape(head_logprob)[2])
            # total_loss -= tf.einsum('ibn,ibn->', final_logprob, final_target)
            # total_cnt += tf.reduce_sum(perms[i])
          else:
            cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])

            cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
            tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)

            tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
                                    perms[i])
            tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.to_int32(tail_target),
                logits=tail_logit)

            sum_nll = cur_head_nll + tail_nll
            mask = tf.reduce_sum(perms[i], [0, 1])

            masked_loss = sum_nll * mask
            total_loss += tf.reduce_sum(masked_loss)
            total_cnt += tf.reduce_sum(mask)

      nll = total_loss / total_cnt

  return nll

def _create_mask(qlen, mlen, same_length=False):
  attn_mask = tf.ones([qlen, qlen])
  mask_u = tf.matrix_band_part(attn_mask, 0, -1)
  mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
  attn_mask_pad = tf.zeros([qlen, mlen])
  ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
  if same_length:
    mask_l = tf.matrix_band_part(attn_mask, -1, 0)
    ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
  return ret

def _cache_mem(curr_out, prev_mem, mem_len=None):
  if mem_len is None or prev_mem is None:
    new_mem = curr_out
  elif mem_len == 0:
    return prev_mem
  else:
    new_mem = tf.concat([prev_mem, curr_out], 0)[- mem_len:]

  return tf.stop_gradient(new_mem)

def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
                n_head, d_head, d_inner, dropout, dropatt,
                initializer, is_training, proj_initializer=None,
                mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
                same_length=False, clamp_len=-1, use_tpu=True,
                input_perms=None, target_perms=None, head_target=None,
                untie_r=False, proj_same_dim=True,
                scope='transformer'):
  """
  cutoffs: a list of python int. Cutoffs for adaptive softmax.
  tie_projs: a list of python bools. Whether to tie the projections.
  use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
      of adaptive softmax.
  perms: a list of tensors. Each tensor should be of size [len, bsz, bin_size].
      Only used in the adaptive setting.
  """
  new_mems = []
  with tf.variable_scope(scope):
    if untie_r:
      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
                                 initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                 initializer=initializer)
    else:
      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
                                 initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
                                 initializer=initializer)

    qlen = tf.shape(dec_inp)[0]
    mlen = tf.shape(mems[0])[0] if mems is not None else 0
    klen = mlen + qlen

    if proj_initializer is None:
      proj_initializer = initializer
    lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
                 mask_adaptive_embedding_lookup)
    embeddings, shared_params = lookup_fn(
        x=dec_inp,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=input_perms,
        proj_same_dim=proj_same_dim)

    attn_mask = _create_mask(qlen, mlen, same_length)

    pos_seq = tf.range(klen - 1, -1, -1.0)
    if clamp_len > 0:
      pos_seq = tf.minimum(pos_seq, clamp_len)
    inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
    pos_emb = positional_embedding(pos_seq, inv_freq)

    output = tf.layers.dropout(embeddings, dropout, training=is_training)
    pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

    if mems is None:
      mems = [None] * n_layer

    for i in range(n_layer):
      # cache new mems
      new_mems.append(_cache_mem(output, mems[i], mem_len))

      with tf.variable_scope('layer_{}'.format(i)):
        output = rel_multihead_attn(
            w=output,
            r=pos_emb,
            r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
            r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
            attn_mask=attn_mask,
            mems=mems[i],
            d_model=d_model,
            n_head=n_head,
            d_head=d_head,
            dropout=dropout,
            dropatt=dropatt,
            is_training=is_training,
            kernel_initializer=initializer)
        output = positionwise_FF(
            inp=output,
            d_model=d_model,
            d_inner=d_inner,
            dropout=dropout,
            kernel_initializer=initializer,
            is_training=is_training)

    output = tf.layers.dropout(output, dropout, training=is_training)

    logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
                     mask_adaptive_logsoftmax)
    loss = logsoftmax_fn(
        hidden=output,
        target=target,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        params=shared_params,
        tie_projs=tie_projs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=target_perms,
        head_target=head_target,
        proj_same_dim=proj_same_dim)
    return loss, new_mems
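
For orientation, below is a minimal sketch of how the transformer() function above might be wired into a TF1 graph; the hyperparameters are illustrative assumptions, not the settings used in the runs logged in this commit.

# Minimal, illustrative wiring of the model code above (TF1 graph mode).
# Every hyperparameter value here is an assumption for demonstration only.
n_layer, d_model, n_head, d_head, d_inner = 4, 256, 4, 64, 1024
n_token, tgt_len, mem_len, bsz = 204, 32, 32, 2

initializer = tf.random_normal_initializer(stddev=0.02)

inp = tf.placeholder(tf.int32, [tgt_len, bsz])
tgt = tf.placeholder(tf.int32, [tgt_len, bsz])
mems = [tf.placeholder(tf.float32, [mem_len, bsz, d_model])
        for _ in range(n_layer)]

# cutoffs=[] disables the adaptive softmax; use_tpu=False selects the
# mask_* (non-TPU) embedding and softmax paths, so no perms are needed.
loss, new_mems = transformer(
    dec_inp=inp, target=tgt, mems=mems,
    n_token=n_token, n_layer=n_layer, d_model=d_model, d_embed=d_model,
    n_head=n_head, d_head=d_head, d_inner=d_inner,
    dropout=0.1, dropatt=0.0,
    initializer=initializer, is_training=True,
    mem_len=mem_len, cutoffs=[], div_val=1, tie_projs=[],
    same_length=False, clamp_len=-1, use_tpu=False)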
Run training...
WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.
WARNING:tensorflow:From train_gpu_test.py:23: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
WARNING:tensorflow:From train_gpu_test.py:23: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.
WARNING:tensorflow:From train_gpu_test.py:23: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
WARNING:tensorflow:From train_gpu_test.py:23: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.
WARNING:tensorflow:From train_gpu_test.py:492: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.
WARNING:tensorflow:From train_gpu_test.py:492: The name tf.app.run is deprecated. Please use tf.compat.v1.app.run instead.
WARNING:tensorflow:From train_gpu_test.py:482: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.
W0623 14:46:20.521119 47187556010368 module_wrapper.py:139] From train_gpu_test.py:482: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.
INFO:tensorflow:n_token 204
I0623 14:46:20.521361 47187556010368 train_gpu_test.py:482] n_token 204
INFO:tensorflow:[train] File names ['train.bsz-12.tlen-512.tfrecords']
I0623 14:46:20.531896 47187556010368 data_utils.py:434] [train] File names ['train.bsz-12.tlen-512.tfrecords']
INFO:tensorflow:num of batches 14483
I0623 14:46:20.532083 47187556010368 train_gpu_test.py:240] num of batches 14483
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.sparse_tensor_to_dense is deprecated. Please use tf.sparse.to_dense instead.
W0623 14:46:34.696085 47187556010368 module_wrapper.py:139] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.sparse_tensor_to_dense is deprecated. Please use tf.sparse.to_dense instead.
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.
W0623 14:46:34.697554 47187556010368 module_wrapper.py:139] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.VarLenFeature is deprecated. Please use tf.io.VarLenFeature instead.
W0623 14:46:34.698294 47187556010368 module_wrapper.py:139] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.VarLenFeature is deprecated. Please use tf.io.VarLenFeature instead.
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.parse_single_example is deprecated. Please use tf.io.parse_single_example instead.
W0623 14:46:34.701079 47187556010368 module_wrapper.py:139] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.parse_single_example is deprecated. Please use tf.io.parse_single_example instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/data_utils.py:506: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
W0623 14:46:35.660339 47187556010368 deprecation.py:323] From /work/home/hepj/tf1/transformer-xl-master/tf/data_utils.py:506: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
WARNING:tensorflow:From train_gpu_test.py:247: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
W0623 14:46:35.673759 47187556010368 deprecation.py:323] From train_gpu_test.py:247: DatasetV1.make_one_shot_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
WARNING:tensorflow:From train_gpu_test.py:259: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.
W0623 14:46:35.692192 47187556010368 module_wrapper.py:139] From train_gpu_test.py:259: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.
WARNING:tensorflow:From train_gpu_test.py:259: The name tf.get_variable_scope is deprecated. Please use tf.compat.v1.get_variable_scope instead.
W0623 14:46:35.692516 47187556010368 module_wrapper.py:139] From train_gpu_test.py:259: The name tf.get_variable_scope is deprecated. Please use tf.compat.v1.get_variable_scope instead.
WARNING:tensorflow:From train_gpu_test.py:263: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
W0623 14:46:35.692863 47187556010368 module_wrapper.py:139] From train_gpu_test.py:263: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/gpu_utils.py:6: The name tf.NodeDef is deprecated. Please use tf.compat.v1.NodeDef instead.
W0623 14:46:35.693674 47187556010368 module_wrapper.py:139] From /work/home/hepj/tf1/transformer-xl-master/tf/gpu_utils.py:6: The name tf.NodeDef is deprecated. Please use tf.compat.v1.NodeDef instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:460: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
W0623 14:46:35.704414 47187556010368 module_wrapper.py:139] From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:460: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:416: The name tf.matrix_band_part is deprecated. Please use tf.linalg.band_part instead.
W0623 14:46:35.742786 47187556010368 module_wrapper.py:139] From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:416: The name tf.matrix_band_part is deprecated. Please use tf.linalg.band_part instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:493: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
W0623 14:46:35.775503 47187556010368 deprecation.py:323] From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:493: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dropout instead.
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
W0623 14:46:35.776226 47187556010368 deprecation.py:323] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:54: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
W0623 14:46:35.801217 47187556010368 deprecation.py:323] From /work/home/hepj/tf1/transformer-xl-master/tf/model.py:54: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
W0623 14:46:36.060614 47187556010368 lazy_loader.py:50]
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
* https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
* https://github.com/tensorflow/addons
* https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.
WARNING:tensorflow:From train_gpu_test.py:194: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.
W0623 14:46:40.507537 47187556010368 module_wrapper.py:139] From train_gpu_test.py:194: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.
INFO:tensorflow:#params: 41055436
I0623 14:46:40.517497 47187556010368 train_gpu_test.py:195] #params: 41055436
INFO:tensorflow:#params: 41055436
I0623 14:46:49.611661 47187556010368 train_gpu_test.py:195] #params: 41055436
INFO:tensorflow:#params: 41055436
I0623 14:46:58.740740 47187556010368 train_gpu_test.py:195] #params: 41055436
INFO:tensorflow:#params: 41055436
I0623 14:47:08.116391 47187556010368 train_gpu_test.py:195] #params: 41055436
WARNING:tensorflow:From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/clip_ops.py:301: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0623 14:47:13.709527 47187556010368 deprecation.py:323] From /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/clip_ops.py:301: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:From train_gpu_test.py:292: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
W0623 14:47:13.892564 47187556010368 module_wrapper.py:139] From train_gpu_test.py:292: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.
WARNING:tensorflow:From train_gpu_test.py:302: The name tf.train.cosine_decay is deprecated. Please use tf.compat.v1.train.cosine_decay instead.
W0623 14:47:13.896909 47187556010368 module_wrapper.py:139] From train_gpu_test.py:302: The name tf.train.cosine_decay is deprecated. Please use tf.compat.v1.train.cosine_decay instead.
WARNING:tensorflow:From train_gpu_test.py:313: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
W0623 14:47:13.910406 47187556010368 module_wrapper.py:139] From train_gpu_test.py:313: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.
WARNING:tensorflow:From train_gpu_test.py:323: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.
W0623 14:47:15.554019 47187556010368 module_wrapper.py:139] From train_gpu_test.py:323: The name tf.train.Saver is deprecated. Please use tf.compat.v1.train.Saver instead.
WARNING:tensorflow:From train_gpu_test.py:325: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.
W0623 14:47:15.950821 47187556010368 module_wrapper.py:139] From train_gpu_test.py:325: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.
WARNING:tensorflow:From train_gpu_test.py:325: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.
W0623 14:47:15.951256 47187556010368 module_wrapper.py:139] From train_gpu_test.py:325: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.
2022-06-23 14:47:15.951746: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2022-06-23 14:47:16.345128: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 1999880000 Hz
2022-06-23 14:47:16.347222: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1399f370 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-06-23 14:47:16.347347: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version
2022-06-23 14:47:16.376028: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libamdhip64.so
2022-06-23 14:47:20.574412: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1d549690 initialized for platform ROCM (this does not guarantee that XLA will be used). Devices:
2022-06-23 14:47:20.574543: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): C878180, AMDGPU ISA version: gfx906
2022-06-23 14:47:20.574586: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (1): C878180, AMDGPU ISA version: gfx906
2022-06-23 14:47:20.574625: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (2): C878180, AMDGPU ISA version: gfx906
2022-06-23 14:47:20.574663: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (3): C878180, AMDGPU ISA version: gfx906
2022-06-23 14:47:20.581876: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 0 with properties:
name: C878180
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.319
pciBusID 0000:04:00.0
2022-06-23 14:47:20.582075: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 1 with properties:
name: C878180
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.319
pciBusID 0000:26:00.0
2022-06-23 14:47:20.582181: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 2 with properties:
name: C878180
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.319
pciBusID 0000:43:00.0
2022-06-23 14:47:20.582264: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1650] Found device 3 with properties:
name: C878180
AMDGPU ISA: gfx906
memoryClockRate (GHz) 1.319
pciBusID 0000:63:00.0
2022-06-23 14:47:23.159813: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocblas.so
2022-06-23 14:47:23.222323: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libMIOpen.so
2022-06-23 14:48:25.788779: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocfft.so
2022-06-23 14:48:25.890072: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocrand.so
2022-06-23 14:48:25.890632: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0, 1, 2, 3
2022-06-23 14:48:25.890804: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-06-23 14:48:25.890868: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186] 0 1 2 3
2022-06-23 14:48:25.890934: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0: N Y Y Y
2022-06-23 14:48:25.890975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 1: Y N Y Y
2022-06-23 14:48:25.891013: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 2: Y Y N Y
2022-06-23 14:48:25.891050: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 3: Y Y Y N
2022-06-23 14:48:25.891650: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14001 MB memory) -> physical GPU (device: 0, name: C878180, pci bus id: 0000:04:00.0)
2022-06-23 14:48:25.899617: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14001 MB memory) -> physical GPU (device: 1, name: C878180, pci bus id: 0000:26:00.0)
2022-06-23 14:48:25.913932: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 14001 MB memory) -> physical GPU (device: 2, name: C878180, pci bus id: 0000:43:00.0)
2022-06-23 14:48:25.922425: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 14001 MB memory) -> physical GPU (device: 3, name: C878180, pci bus id: 0000:63:00.0)
WARNING:tensorflow:From train_gpu_test.py:326: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.
W0623 14:48:25.955481 47187556010368 module_wrapper.py:139] From train_gpu_test.py:326: The name tf.global_variables_initializer is deprecated. Please use tf.compat.v1.global_variables_initializer instead.
2022-06-23 14:48:29.635470: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-06-23 14:48:29.853604: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 13.67G (14682108416 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.853823: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 12.31G (13213896704 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.853928: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 11.08G (11892506624 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854018: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 9.97G (10703255552 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854108: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.97G (9632929792 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854214: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.07G (8669636608 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854311: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 7.27G (7802672640 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854409: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 6.54G (7022405120 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854502: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.89G (6320164352 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854589: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.30G (5688147968 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854702: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.77G (5119332864 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854792: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.29G (4607399424 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.854922: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.86G (4146659328 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855037: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.48G (3731993344 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855140: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.13G (3358793984 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855247: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.81G (3022914560 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855350: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.53G (2720623104 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855448: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.28G (2448560640 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855561: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.05G (2203704576 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855672: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.85G (1983334144 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855761: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.66G (1785000704 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855880: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.50G (1606500608 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.855996: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.35G (1445850624 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.856110: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.21G (1301265664 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.856201: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.09G (1171139072 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:29.856310: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1005.20M (1054025216 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.325078: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-06-23 14:48:54.519871: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 13.67G (14682124032 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520098: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 12.31G (13213911040 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520185: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 11.08G (11892519936 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520269: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 9.97G (10703267840 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520352: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.97G (9632941056 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520434: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.07G (8669646848 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520515: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 7.27G (7802681856 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520596: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 6.54G (7022413312 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520677: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.89G (6320172032 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520757: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.30G (5688154624 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520838: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.77G (5119339008 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.520931: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.29G (4607405056 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521011: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.86G (4146664448 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521100: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.48G (3731997952 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521181: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.13G (3358798080 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521262: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.81G (3022918144 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521355: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.53G (2720626176 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521436: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.28G (2448563456 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521516: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.05G (2203707136 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521597: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.85G (1983336448 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521677: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.66G (1785002752 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521756: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.50G (1606502400 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521836: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.35G (1445852160 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.521925: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.21G (1301266944 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.522006: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.09G (1171140352 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.522086: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1005.20M (1054026496 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.550346: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library librocblas.so
2022-06-23 14:48:54.751225: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 13.67G (14682116096 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751450: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 12.31G (13213903872 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751537: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 11.08G (11892512768 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751620: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 9.97G (10703261696 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751702: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.97G (9632934912 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751797: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.07G (8669640704 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751887: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 7.27G (7802676224 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.751969: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 6.54G (7022408192 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752049: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.89G (6320167424 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752129: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.30G (5688150528 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752209: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.77G (5119335424 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752299: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.29G (4607401984 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752379: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.86G (4146661632 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752460: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.48G (3731995392 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752540: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.13G (3358795776 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752621: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.81G (3022916096 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752701: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.53G (2720624384 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752780: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.28G (2448561920 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752868: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.05G (2203705600 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.752950: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.85G (1983335168 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753033: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.66G (1785001728 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753114: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.50G (1606501632 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753194: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.35G (1445851392 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753274: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.21G (1301266176 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753353: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.09G (1171139584 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.753433: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1005.20M (1054025728 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.979658: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 13.67G (14682108416 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.979903: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 12.31G (13213896704 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.979991: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 11.08G (11892506624 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980075: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 9.97G (10703255552 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980159: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.97G (9632929792 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980239: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 8.07G (8669636608 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980320: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 7.27G (7802672640 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980401: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 6.54G (7022405120 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980482: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.89G (6320164352 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980562: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 5.30G (5688147968 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980656: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.77G (5119332864 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980750: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 4.29G (4607399424 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980831: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.86G (4146659328 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.980920: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.48G (3731993344 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981001: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 3.13G (3358793984 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981081: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.81G (3022914560 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981161: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.53G (2720623104 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981241: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.28G (2448560640 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981320: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 2.05G (2203704576 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981400: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.85G (1983334144 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981479: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.66G (1785000704 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981559: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.50G (1606500608 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981639: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.35G (1445850624 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981719: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.21G (1301265664 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981800: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1.09G (1171139072 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:54.981887: E tensorflow/stream_executor/rocm/rocm_driver.cc:645] failed to allocate 1005.20M (1054025216 bytes) from device: HIP_ERROR_OutOfMemory
2022-06-23 14:48:55.422345: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
2022-06-23 14:48:55.425431: I tensorflow/core/graph/gpu_fusion_pass.cc:505] ROCm Fusion is enabled.
Error 218(hipErrorInvalidKernelFile) /data/jenkins_workspace/workspace/rocBLAS_release/rocblas/build/release/virtualenv/lib64/python3.6/site-packages/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp:84:
error
hipErrorInvalidKernelFile
/work/home/hepj/app/dtk-22.04.1/rocblas/lib/library_dcu2/TensileLibrary_gfx906.co
Error 218(hipErrorInvalidKernelFile) /data/jenkins_workspace/workspace/rocBLAS_release/rocblas/build/release/virtualenv/lib64/python3.6/site-packages/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp:84:
error
hipErrorInvalidKernelFile
/work/home/hepj/app/dtk-22.04.1/rocblas/lib/library_dcu2/TensileLibrary_gfx906.co
Error 218(hipErrorInvalidKernelFile) /data/jenkins_workspace/workspace/rocBLAS_release/rocblas/build/release/virtualenv/lib64/python3.6/site-packages/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp:84:
error
hipErrorInvalidKernelFile
/work/home/hepj/app/dtk-22.04.1/rocblas/lib/library_dcu2/TensileLibrary_gfx906.co
Error 218(hipErrorInvalidKernelFile) /data/jenkins_workspace/workspace/rocBLAS_release/rocblas/build/release/virtualenv/lib64/python3.6/site-packages/Tensile/Source/lib/source/hip/HipSolutionAdapter.cpp:84:
error
hipErrorInvalidKernelFile
/work/home/hepj/app/dtk-22.04.1/rocblas/lib/library_dcu2/TensileLibrary_gfx906.co
rocBLAS error: Tensile solution found, but exception thrown for { a_type: "f32_r", b_type: "f32_r", c_type: "f32_r", d_type: "f32_r", compute_type: "f32_r", transA: 'N', transB: 'N', M: 1536, N: 3072, K: 512, alpha: 1, row_stride_a: 1, col_stride_a: 1536, row_stride_b: 1, col_stride_b: 512, row_stride_c: 1, col_stride_c: 1536, row_stride_d: 1, col_stride_d: 1536, beta: 0, batch_count: 1, strided_batch: true, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, atomics_mode: atomics_not_allowed }
Kernel Cijk_Ailk_Bljk_SB_MT128x64x16_SN_APM1_AF0EM1_AF1EM1_AMAS3_ASAE01_ASCE01_ASEM1_BL1_DTL0_ETSP_EPS0_FL0_GRVW4_GSU1_ISA906_IU1_K1_KLA_LPA0_LPB0_LDL1_LRVW4_MAC_MDA2_NLCA1_NLCB1_ONLL1_PK0_PGR0_PLR1_RK0_SU32_SUM0_SUS256_SVW4_SNLL0_TT8_4_USFGRO0_VAW1_VS1_VW4_WG16_16_1_WGM1 not found in any loaded module.
This message will be only be displayed once, unless the ROCBLAS_VERBOSE_TENSILE_ERROR environment variable is set.
2022-06-23 14:49:05.193083: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193084: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193098: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193602: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193683: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193756: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.193945: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.194235: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.194547: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.194827: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195128: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.194451: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195339: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195407: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195598: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195656: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.195961: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196166: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196230: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196526: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196709: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196821: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.196884: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.197241: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.197292: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.197748: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.198108: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.198415: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.198601: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.198852: E tensorflow/stream_executor/rocm/rocm_blas.cc:416] failed to run ROCBLAS routine rocblas_sgemm: rocblas_status_internal_error
2022-06-23 14:49:05.906378: W tensorflow/core/kernels/data/cache_dataset_ops.cc:824] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
==================================================
/work/home/hepj/tf1/transformer-xl-master/data/enwik8//tfrecords/record_info-train.bsz-12.tlen-512.json
==================================================
Traceback (most recent call last):
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(1024, 512), b.shape=(512, 512), m=1024, n=512, k=512
[[{{node transformer_1/layer_2/rel_attn/r/Tensordot/MatMul}}]]
(1) Internal: Blas GEMM launch failed : a.shape=(1024, 512), b.shape=(512, 512), m=1024, n=512, k=512
[[{{node transformer/layer_2/rel_attn/r/Tensordot/MatMul}}]]
0 successful operations.
3 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train_gpu_test.py", line 492, in <module>
tf.app.run()
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "train_gpu_test.py", line 486, in main
train(n_token, cutoffs, "/gpu:0")
File "train_gpu_test.py", line 341, in train
fetched = sess.run(fetches, feed_dict=feed_dict)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: 2 root error(s) found.
(0) Internal: Blas GEMM launch failed : a.shape=(1024, 512), b.shape=(512, 512), m=1024, n=512, k=512
[[node transformer_1/layer_2/rel_attn/r/Tensordot/MatMul (defined at /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
(1) Internal: Blas GEMM launch failed : a.shape=(1024, 512), b.shape=(512, 512), m=1024, n=512, k=512
[[node transformer/layer_2/rel_attn/r/Tensordot/MatMul (defined at /work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
3 derived errors ignored.
Original stack trace for 'transformer_1/layer_2/rel_attn/r/Tensordot/MatMul':
File "train_gpu_test.py", line 492, in <module>
tf.app.run()
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "train_gpu_test.py", line 486, in main
train(n_token, cutoffs, "/gpu:0")
File "train_gpu_test.py", line 271, in train
mems=mems_i)
File "train_gpu_test.py", line 223, in single_core_graph
is_training=is_training)
File "train_gpu_test.py", line 191, in model_fn
proj_same_dim=FLAGS.proj_same_dim)
File "/work/home/hepj/tf1/transformer-xl-master/tf/model.py", line 517, in transformer
kernel_initializer=initializer)
File "/work/home/hepj/tf1/transformer-xl-master/tf/model.py", line 56, in rel_multihead_attn
kernel_initializer=kernel_initializer, name='r')
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py", line 187, in dense
return layer.apply(inputs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py", line 1700, in apply
return self.__call__(inputs, *args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/layers/base.py", line 548, in __call__
outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py", line 854, in __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py", line 234, in wrapper
return converted_call(f, options, args, kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py", line 439, in converted_call
return _call_unconverted(f, args, kwargs, options)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py", line 330, in _call_unconverted
return f(*args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/keras/layers/core.py", line 1039, in call
outputs = standard_ops.tensordot(inputs, self.kernel, [[rank - 1], [0]])
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 4096, in tensordot
ab_matmul = matmul(a_reshape, b_reshape)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/util/dispatch.py", line 180, in wrapper
return target(*args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 2754, in matmul
a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_math_ops.py", line 6236, in mat_mul
name=name)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/work/home/hepj/.pyenv/versions/tf1/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
#!/usr/bin/env python
# coding=utf-8
import os
import sys
import zipfile
if os.path.exists('train.txt'):
    print('Tokenized enwik8 already exists - skipping processing')
    sys.exit()

data = zipfile.ZipFile('enwik8.zip').read('enwik8')
print('Length of enwik8: {}'.format(len(data)))

num_test_chars = 5000000

train_data = data[: -2 * num_test_chars]
valid_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]:
    print('{} will have {} bytes'.format(fn, len(part)))
    print('- Tokenizing...')
    part_str = ' '.join([str(c) if c != ord('\n') else '\n' for c in part])
    print('- Writing...')
    f = open(fn, 'w').write(part_str)
    f = open(fn + '.raw', 'wb').write(part)
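A quick sanity check on the byte-level vocabulary this split implies (a minimal sketch, not part of the script above; assumes enwik8.zip is in the working directory). The count should be close to the n_token 204 reported in the training log above.
import zipfile
# count the distinct byte values in the raw enwik8 data
raw = zipfile.ZipFile('enwik8.zip').read('enwik8')
print('distinct byte values: {}'.format(len(set(raw))))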
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import math
import time
from absl import flags
import absl.logging as _logging # pylint: disable=unused-import
import tensorflow as tf
import model
import data_utils
from gpu_utils import assign_to_gpu, average_grads_and_vars
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
from tensorflow.python.profiler import model_analyzer
from tensorflow.python.profiler import option_builder
import os
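# enable TensorFlow's automatic mixed precision graph rewrite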
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.logging.set_verbosity(tf.logging.INFO)
# GPU config
flags.DEFINE_integer("num_hosts", default=1,
help="Number of TPU hosts")
flags.DEFINE_integer("num_core_per_host", default=8,
help="Number of cores per host")
# Experiment (data/checkpoint/directory) config
flags.DEFINE_string("data_dir", default="",
help="Path to tf-records directory.")
flags.DEFINE_string("record_info_dir", default="",
help="Path to local directory containing filenames.txt.")
flags.DEFINE_string("corpus_info_path", default="",
help="Path to corpus-info.json file.")
flags.DEFINE_string("model_dir", default=None,
help="Estimator model_dir.")
flags.DEFINE_bool("do_train", default=True,
help="Whether to run training.")
flags.DEFINE_bool("do_eval", default=False,
help="Whether to run eval on the dev set.")
flags.DEFINE_string("eval_ckpt_path", None,
help="Checkpoint path for do_test evaluation."
"If set, model_dir will be ignored."
"If unset, will use the latest ckpt in model_dir.")
flags.DEFINE_string("warm_start_path", None,
help="Checkpoint path for warm start."
"If set, will clear Adam states."
"Note that the new model_dir should be different"
" from warm_start_path.")
# Optimization config
flags.DEFINE_float("learning_rate", default=2.5e-4,
help="Maximum learning rate.")
flags.DEFINE_float("clip", default=0.25,
help="Gradient clipping value.")
# for cosine decay
flags.DEFINE_float("min_lr_ratio", default=0.004,
help="Minimum ratio learning rate.")
flags.DEFINE_integer("warmup_steps", default=0,
help="Number of steps for linear lr warmup.")
# Training config
flags.DEFINE_integer("train_batch_size", default=60,
help="Size of train batch.")
flags.DEFINE_integer("eval_batch_size", default=60,
help="Size of valid batch.")
flags.DEFINE_integer("train_steps", default=100000,
help="Total number of training steps.")
flags.DEFINE_integer("iterations", default=500,
help="Number of iterations per repeat loop.")
flags.DEFINE_integer("save_steps", default=10000,
help="number of steps for model checkpointing.")
# Evaluation config
flags.DEFINE_bool("do_test", default=False,
help="Run on the test set.")
flags.DEFINE_integer("max_eval_batch", default=-1,
help="Set -1 to turn off. Only used in test mode.")
flags.DEFINE_bool("do_eval_only", default=False,
help="Run evaluation only.")
flags.DEFINE_integer("start_eval_steps", default=10000,
help="Which checkpoint to start with in `do_eval_only` mode.")
flags.DEFINE_string("eval_split", "valid",
help="Which data split to evaluate.")
# Model config
flags.DEFINE_integer("tgt_len", default=70,
help="Number of steps to predict")
flags.DEFINE_integer("mem_len", default=70,
help="Number of steps to cache")
flags.DEFINE_bool("same_length", default=False,
help="Same length attention")
flags.DEFINE_integer("clamp_len", default=-1,
help="Clamp length")
flags.DEFINE_integer("n_layer", default=6,
help="Number of layers.")
flags.DEFINE_integer("d_model", default=500,
help="Dimension of the model.")
flags.DEFINE_integer("d_embed", default=500,
help="Dimension of the embeddings.")
flags.DEFINE_integer("n_head", default=10,
help="Number of attention heads.")
flags.DEFINE_integer("d_head", default=50,
help="Dimension of each attention head.")
flags.DEFINE_integer("d_inner", default=1000,
help="Dimension of inner hidden size in positionwise feed-forward.")
flags.DEFINE_float("dropout", default=0.1,
help="Dropout rate.")
flags.DEFINE_float("dropatt", default=0.1,
help="Attention dropout rate.")
flags.DEFINE_bool("untie_r", default=False,
help="untie r_w_bias and r_r_bias")
# Adaptive Softmax / Embedding
flags.DEFINE_bool("tie_weight", default=True,
help="Tie embedding and softmax weight.")
flags.DEFINE_integer("div_val", default=1,
help="Divide the embedding size by this val for each bin")
flags.DEFINE_bool("proj_share_all_but_first", default=False,
help="True to share all but first projs, False not to share.")
flags.DEFINE_bool("proj_same_dim", default=True,
help="Project the bin with the same dimension.")
# Parameter initialization
flags.DEFINE_enum("init", default="normal",
enum_values=["normal", "uniform"],
help="Initialization method.")
flags.DEFINE_float("init_std", default=0.02,
help="Initialization std when init is normal.")
flags.DEFINE_float("proj_init_std", default=0.01,
help="Initialization std for embedding projection.")
flags.DEFINE_float("init_range", default=0.1,
help="Initialization std when init is uniform.")
FLAGS = flags.FLAGS
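# model_fn builds one tower's graph: transformer forward pass, loss, and (when training) gradients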
def get_model_fn(n_token, cutoffs):
def model_fn(inp, tgt, mems, is_training):
inp = tf.transpose(inp, [1, 0])
tgt = tf.transpose(tgt, [1, 0])
if FLAGS.init == "uniform":
initializer = tf.initializers.random_uniform(
minval=-FLAGS.init_range,
maxval=FLAGS.init_range,
seed=None)
elif FLAGS.init == "normal":
initializer = tf.initializers.random_normal(
stddev=FLAGS.init_std,
seed=None)
proj_initializer = tf.initializers.random_normal(
stddev=FLAGS.proj_init_std,
seed=None)
tie_projs = [False for _ in range(len(cutoffs) + 1)]
if FLAGS.proj_share_all_but_first:
for i in range(1, len(tie_projs)):
tie_projs[i] = True
loss, new_mems = model.transformer(
dec_inp=inp,
target=tgt,
mems=mems,
n_token=n_token,
n_layer=FLAGS.n_layer,
d_model=FLAGS.d_model,
d_embed=FLAGS.d_embed,
n_head=FLAGS.n_head,
d_head=FLAGS.d_head,
d_inner=FLAGS.d_inner,
dropout=FLAGS.dropout,
dropatt=FLAGS.dropatt,
initializer=initializer,
proj_initializer=proj_initializer,
is_training=is_training,
mem_len=FLAGS.mem_len,
cutoffs=cutoffs,
div_val=FLAGS.div_val,
tie_projs=tie_projs,
input_perms=None,
target_perms=None,
head_target=None,
same_length=FLAGS.same_length,
clamp_len=FLAGS.clamp_len,
use_tpu=False,
untie_r=FLAGS.untie_r,
proj_same_dim=FLAGS.proj_same_dim)
# number of parameters
num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
tf.logging.info('#params: {}'.format(num_params))
# format_str = '{{:<{0}s}}\t{{}}'.format(
# max([len(v.name) for v in tf.trainable_variables()]))
# for v in tf.trainable_variables():
# tf.logging.info(format_str.format(v.name, v.get_shape()))
if is_training:
all_vars = tf.trainable_variables()
grads = tf.gradients(loss, all_vars)
grads_and_vars = list(zip(grads, all_vars))
return loss, new_mems, grads_and_vars
else:
return loss, new_mems
return model_fn
def single_core_graph(n_token, cutoffs, is_training, inp, tgt, mems):
model_fn = get_model_fn(
n_token=n_token,
cutoffs=cutoffs)
model_ret = model_fn(
inp=inp,
tgt=tgt,
mems=mems,
is_training=is_training)
return model_ret
def train(n_token, cutoffs, ps_device):
##### Get input function and model function
tf.logging.set_verbosity(tf.logging.INFO)
train_input_fn, train_record_info = data_utils.get_input_fn(
record_info_dir=FLAGS.record_info_dir,
split="train",
per_host_bsz=FLAGS.train_batch_size,
tgt_len=FLAGS.tgt_len,
num_core_per_host=FLAGS.num_core_per_host,
num_hosts=1,
use_tpu=False)
tf.logging.info("num of batches {}".format(train_record_info["num_batch"]))
##### Create computational graph
train_set = train_input_fn({
"batch_size": FLAGS.train_batch_size,
"data_dir": FLAGS.data_dir})
input_feed, label_feed = train_set.make_one_shot_iterator().get_next()
inputs = tf.split(input_feed, FLAGS.num_core_per_host, 0)
labels = tf.split(label_feed, FLAGS.num_core_per_host, 0)
per_core_bsz = FLAGS.train_batch_size // FLAGS.num_core_per_host
tower_mems, tower_losses, tower_new_mems, tower_grads_and_vars = [], [], [], []
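# Build one tower (model replica) per GPU; variables are shared across towers via variable_scope reuse.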
for i in range(FLAGS.num_core_per_host):
reuse = True if i > 0 else None
with tf.device(assign_to_gpu(i, ps_device)), \
tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
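# One memory placeholder per layer, shaped [mem_len, batch, d_model], carrying the previous segment's hidden states.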
mems_i = [tf.placeholder(tf.float32,
[FLAGS.mem_len, per_core_bsz, FLAGS.d_model])
for _ in range(FLAGS.n_layer)]
loss_i, new_mems_i, grads_and_vars_i = single_core_graph(
n_token=n_token,
cutoffs=cutoffs,
is_training=True,
inp=inputs[i],
tgt=labels[i],
mems=mems_i)
tower_mems.append(mems_i)
tower_losses.append(loss_i)
tower_new_mems.append(new_mems_i)
tower_grads_and_vars.append(grads_and_vars_i)
## average losses and gradients across towers
if len(tower_losses) > 1:
loss = tf.add_n(tower_losses) / len(tower_losses)
grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
else:
loss = tower_losses[0]
grads_and_vars = tower_grads_and_vars[0]
grads, all_vars = zip(*grads_and_vars)
## clip gradient
clipped, gnorm = tf.clip_by_global_norm(grads, FLAGS.clip)
grads_and_vars = list(zip(clipped, all_vars))
## configure the optimizer
global_step = tf.train.get_or_create_global_step()
# warmup stage: increase the learning rate linearly
if FLAGS.warmup_steps > 0:
warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \
* FLAGS.learning_rate
else:
warmup_lr = 0.0
# decay stage: decay the learning rate using the cosine schedule
decay_lr = tf.train.cosine_decay(
FLAGS.learning_rate,
global_step=global_step-FLAGS.warmup_steps,
decay_steps=FLAGS.train_steps-FLAGS.warmup_steps,
alpha=FLAGS.min_lr_ratio)
# choose warmup or decay
learning_rate = tf.where(global_step < FLAGS.warmup_steps,
warmup_lr, decay_lr)
# get the train op
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.apply_gradients(grads_and_vars, global_step)
##### Training loop
tower_mems_np = [
[np.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], dtype=np.float32)
for layer in range(FLAGS.n_layer)]
for core in range(FLAGS.num_core_per_host)
]
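# Memories start as zeros; each sess.run returns updated memories (tower_new_mems) that are fed back on the next step.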
saver = tf.train.Saver()
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
# modified: create a tf.profiler Profiler and full-trace run options
profiler = model_analyzer.Profiler(graph=sess.graph)
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
# end modification
sess.run(tf.global_variables_initializer())
if FLAGS.warm_start_path is not None:
tf.logging.info("warm start from {}".format(FLAGS.warm_start_path))
saver.restore(sess, FLAGS.warm_start_path)
fetches = [loss, tower_new_mems, global_step, gnorm, learning_rate, train_op]
total_loss, prev_step = 0., -1
while True:
feed_dict = {}
for i in range(FLAGS.num_core_per_host):
for m, m_np in zip(tower_mems[i], tower_mems_np[i]):
feed_dict[m] = m_np
# modified: pass the profiler run options and collect run metadata for this step
fetched = sess.run(fetches, feed_dict=feed_dict, options=run_options, run_metadata=run_metadata)
loss_np, tower_mems_np, curr_step = fetched[:3]
total_loss += loss_np
# modified: register this step's run metadata with the profiler
profiler.add_step(step=curr_step, run_meta=run_metadata)
# end modification
# modified: time each step and log global_step/sec
if curr_step==0:
start_time=time.time()
if curr_step > 0:
end_time=time.time()
global_step_s=1/(end_time-start_time)
start_time=end_time
tf.logging.info("global_step/sec: {:.6f} , step= {}".format(global_step_s,curr_step))
#tf.logging.info("examples/sec : {}".format(global_step_s * FLAGS.train_batch_size))
# end modification
if curr_step > 0 and curr_step % FLAGS.iterations == 0:
curr_loss = total_loss / (curr_step - prev_step)
tf.logging.info("[{}] | gnorm {:.2f} lr {:8.6f} "
"| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
curr_step, fetched[-3], fetched[-2],
curr_loss, math.exp(curr_loss), curr_loss / math.log(2)))
total_loss, prev_step = 0., curr_step
if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
save_path = os.path.join(FLAGS.model_dir, "model.ckpt")
saver.save(sess, save_path)
tf.logging.info("Model saved in path: {}".format(save_path))
if curr_step == FLAGS.train_steps:
break
# modified: build the profiler report options after training finishes
profile_op_opt_builder = option_builder.ProfileOptionBuilder()
profile_op_opt_builder.select(['micros', 'occurrence'])
profile_op_opt_builder.order_by('occurrence')
profile_op_opt_builder.with_max_depth(10)
# only report the statistics recorded at step 5
profile_op_opt_builder.with_step(5)
# write the profiling results to a file
profile_op_opt_builder.with_file_output("./prof.txt")
# also dump a timeline trace to prof.json
profile_op_opt_builder.with_timeline_output("./prof.json")
# display the results as an op view
profiler.profile_operations(profile_op_opt_builder.build())
def evaluate(n_token, cutoffs, ps_device):
##### Get input function and model function
eval_input_fn, eval_record_info = data_utils.get_input_fn(
record_info_dir=FLAGS.record_info_dir,
split=FLAGS.eval_split,
per_host_bsz=FLAGS.eval_batch_size,
tgt_len=FLAGS.tgt_len,
num_core_per_host=FLAGS.num_core_per_host,
num_hosts=1,
use_tpu=False)
num_batch = eval_record_info["num_batch"]
if FLAGS.max_eval_batch > 0:
num_batch = FLAGS.max_eval_batch
tf.logging.info("num of batches {}".format(num_batch))
##### Create computational graph
eval_set = eval_input_fn({
"batch_size": FLAGS.eval_batch_size,
"data_dir": FLAGS.data_dir})
input_feed, label_feed = eval_set.make_one_shot_iterator().get_next()
inputs = tf.split(input_feed, FLAGS.num_core_per_host, 0)
labels = tf.split(label_feed, FLAGS.num_core_per_host, 0)
per_core_bsz = FLAGS.eval_batch_size // FLAGS.num_core_per_host
tower_mems, tower_losses, tower_new_mems = [], [], []
for i in range(FLAGS.num_core_per_host):
with tf.device(assign_to_gpu(i, ps_device)), \
tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
mems_i = [tf.placeholder(tf.float32,
[FLAGS.mem_len, per_core_bsz, FLAGS.d_model])
for _ in range(FLAGS.n_layer)]
loss_i, new_mems_i = single_core_graph(
n_token=n_token,
cutoffs=cutoffs,
is_training=False,
inp=inputs[i],
tgt=labels[i],
mems=mems_i)
tower_mems.append(mems_i)
tower_losses.append(loss_i)
tower_new_mems.append(new_mems_i)
## average losses across towers
if len(tower_losses) > 1:
loss = tf.add_n(tower_losses) / len(tower_losses)
else:
loss = tower_losses[0]
##### Evaluation loop
tower_mems_np = [
[np.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], dtype=np.float32)
for layer in range(FLAGS.n_layer)]
for core in range(FLAGS.num_core_per_host)
]
saver = tf.train.Saver()
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
sess.run(tf.global_variables_initializer())
if FLAGS.eval_ckpt_path is None:
eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir)
else:
eval_ckpt_path = FLAGS.eval_ckpt_path
tf.logging.info("Evaluate {}".format(eval_ckpt_path))
saver.restore(sess, eval_ckpt_path)
fetches = [loss, tower_new_mems, tf.size(label_feed)]
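# tf.size(label_feed) is the number of target tokens in the batch; it weights the per-batch loss in the running average.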
format_str = " >> processing batch {{:{0}d}}/{{:{0}d}} ..".format(
len(str(num_batch)))
total_loss, total_cnt = 0, 0
for step in range(num_batch):
if step % (num_batch // 10) == 0:
tf.logging.info(format_str.format(step, num_batch))
feed_dict = {}
for i in range(FLAGS.num_core_per_host):
for m, m_np in zip(tower_mems[i], tower_mems_np[i]):
feed_dict[m] = m_np
fetched = sess.run(fetches, feed_dict=feed_dict)
loss_np, tower_mems_np, cnt_np = fetched[:3]
total_loss += loss_np * cnt_np
total_cnt += cnt_np
avg_loss = total_loss / total_cnt
tf.logging.info("| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
avg_loss, math.exp(avg_loss), avg_loss / math.log(2)))
def main(unused_argv):
del unused_argv # Unused
tf.logging.set_verbosity(tf.logging.INFO)
# Get corpus info
corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path)
n_token = corpus_info["vocab_size"]
cutoffs = corpus_info["cutoffs"][1:-1]
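# When cutoffs are present (e.g. lm1b), they include the 0 and vocab_size endpoints; keep only the interior boundaries for the adaptive softmax.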
tf.logging.info("n_token {}".format(n_token))
if FLAGS.do_train:
tf.logging.set_verbosity(tf.logging.INFO)
train(n_token, cutoffs, "/gpu:0")
if FLAGS.do_eval:
evaluate(n_token, cutoffs, "/gpu:0")
if __name__ == "__main__":
tf.app.run()
#!/bin/bash
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export MIOPEN_FIND_MODE=3
# export MIOPEN_ENABLE_LOGGING_CMD=1
# export ROCBLAS_LAYER=3
# module unload compiler/rocm/2.9
# echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
# lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# comm_rank=$OMPI_COMM_WORLD_RANK
# comm_size=$OMPI_COMM_WORLD_SIZE
# NCCL_DEBUG=INFO
# Data
#DATA_ROOT=../data/enwik8/
DATA_ROOT=/work/home/hepj/tf1/transformer-xl-master/data/enwik8/
MODEL_DIR=./EXP-enwik8_1_test
# Model
N_LAYER=12
D_MODEL=512
D_EMBED=512
N_HEAD=8
D_HEAD=64
D_INNER=2048
# Training
TGT_LEN=512
MEM_LEN=512
TRAIN_STEPS=14483
BSZ=12 #12
NUM_CORE=1
# Testing
TEST_TGT_LEN=80
TEST_MEM_LEN=2100
TEST_CLAMP_LEN=820
TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
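# "${@:2}" forwards any additional command-line flags to the Python script.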
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu_test.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=${MODEL_DIR} \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=${TRAIN_STEPS} \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-enwik8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Data
#DATA_ROOT=../data/enwik8/
DATA_ROOT=/work/home/hepj/tf1/transformer-xl-master/data/enwik8/
# Model
N_LAYER=12
D_MODEL=512
D_EMBED=512
N_HEAD=8
D_HEAD=64
D_INNER=2048
# Training
TGT_LEN=512
MEM_LEN=512
TRAIN_STEPS=14483 # 7242 # the record info shows the data has this many batches, so run the same number of steps
# use 12 for testing
BSZ=12
NUM_CORE=4
# Testing
TEST_TGT_LEN=80
TEST_MEM_LEN=2100
TEST_CLAMP_LEN=820
TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
# modified: run the profiling-instrumented script instead of train_gpu.py
#python train_gpu.py \
python train_gpu_test.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-enwik8_4_new_bs12 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=${TRAIN_STEPS} \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-enwik8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# export HSA_FORCE_FINE_GRAIN_PCIE=1
# export MIOPEN_FIND_MODE=3
# export MIOPEN_ENABLE_LOGGING_CMD=1
# export ROCBLAS_LAYER=3
# module unload compiler/rocm/2.9
# echo "MIOPEN_FIND_MODE=$MIOPEN_FIND_MODE"
# lrank=$OMPI_COMM_WORLD_LOCAL_RANK
# comm_rank=$OMPI_COMM_WORLD_RANK
# comm_size=$OMPI_COMM_WORLD_SIZE
# NCCL_DEBUG=INFO
export HIP_VISIBLE_DEVICES=0
# Data
#DATA_ROOT=../data/enwik8/
DATA_ROOT=/public/home/hepj/SothisAI/transformer-xl-master/data/text8
# Model
N_LAYER=12
D_MODEL=512
D_EMBED=512
N_HEAD=8
D_HEAD=64
D_INNER=2048
# Training
TGT_LEN=512
MEM_LEN=512
TRAIN_STEPS=14483
BSZ=12
NUM_CORE=1
# Testing
TEST_TGT_LEN=80
TEST_MEM_LEN=2100
TEST_CLAMP_LEN=820
TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu_test.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-enwik8_test \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=${TRAIN_STEPS} \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-enwik8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Path
LOCAL_DIR=../data/enwik8/
GSDATA=
GSEXP=
# TPU setting
NUM_HOST=2
NUM_CORE=16 # TPUv2 -> 8 | TPUv3 -> 16
TEST_NUM_HOST=1
TEST_NUM_CORE=8 # TPUv2 -> 8 | TPUv3 -> 16
# Model
N_LAYER=24
D_MODEL=1024
D_EMBED=1024
N_HEAD=8
D_HEAD=128
D_INNER=3072
# Training
TGT_LEN=768
MEM_LEN=768
TRAIN_BSZ=64
VALID_BSZ=64
# Testing
TEST_TGT_LEN=128
TEST_MEM_LEN=3800
TEST_CLAMP_LEN=1000
TEST_BSZ=16
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=enwik8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${TRAIN_BSZ} \
--per_host_valid_bsz=${VALID_BSZ} \
--num_core_per_host=${NUM_CORE} \
--num_passes=10 \
--use_tpu=True \
${@:2}
SRC_PATTERN=train.bsz-${TRAIN_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/enwik8-tfrecords/
SRC_PATTERN=valid.bsz-${VALID_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/enwik8-tfrecords/
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=enwik8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--num_passes=1 \
--use_tpu=True \
${@:2}
SRC_PATTERN=test.bsz-${TEST_BSZ}.tlen-${TEST_TGT_LEN}.core-${TEST_NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/enwik8-tfrecords/
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--data_dir=${GSDATA}/enwik8-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/enwik8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.15 \
--dropatt=0.15 \
--learning_rate=0.00025 \
--warmup_steps=4000 \
--train_steps=400000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${TRAIN_BSZ} \
--use_tpu=True \
--num_host=${NUM_HOST} \
--num_core_per_host=${NUM_CORE} \
--iterations=1000 \
--save_steps=10000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train.py \
--data_dir=${GSDATA}/enwik8-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/enwik8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--eval_batch_size=${TEST_BSZ} \
--num_host=${TEST_NUM_HOST} \
--num_core_per_host=${TEST_NUM_CORE} \
--use_tpu=True \
--do_train=False \
--do_eval_only=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Data
DATA_ROOT=../data/one-billion-words/
# Model
DIV_VAL=4
N_LAYER=18
D_MODEL=1024
D_EMBED=1024
N_HEAD=8
D_HEAD=128
D_INNER=4096
# Training
TGT_LEN=256
MEM_LEN=256
BSZ=256
NUM_CORE=4
# Testing
TEST_TGT_LEN=32
TEST_MEM_LEN=128
TEST_CLAMP_LEN=-1
TEST_BSZ=16
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=lm1b \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=lm1b \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-lm1b \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=False \
--proj_same_dim=False \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=400000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-lm1b \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=False \
--proj_same_dim=False \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Path
LOCAL_DIR=../data/one-billion-words/
GSDATA=
GSEXP=
# TPU setting
NUM_HOST=32
NUM_CORE=16 # TPUv2 -> 8 | TPUv3 -> 16
TEST_NUM_HOST=1
TEST_NUM_CORE=8 # TPUv2 -> 8 | TPUv3 -> 16
# Model
DIV_VAL=4
N_LAYER=24
D_MODEL=1280
D_EMBED=1280
N_HEAD=16
D_HEAD=80
D_INNER=8192
# Training
TGT_LEN=32
MEM_LEN=32
TRAIN_BSZ=512
VALID_BSZ=512
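# Per-host batch sizes: the global train/valid batch is split evenly across the NUM_HOST TPU hosts.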
TRAIN_BSZ_PER_HOST=$((TRAIN_BSZ / NUM_HOST))
VALID_BSZ_PER_HOST=$((VALID_BSZ / NUM_HOST))
# Testing
TEST_TGT_LEN=32
TEST_MEM_LEN=128
TEST_CLAMP_LEN=-1
TEST_BSZ=8
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=lm1b \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${TRAIN_BSZ_PER_HOST} \
--per_host_valid_bsz=${VALID_BSZ_PER_HOST} \
--num_core_per_host=${NUM_CORE} \
--num_passes=10 \
--use_tpu=True \
${@:2}
SRC_PATTERN=train.bsz-${TRAIN_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/lm1b-tfrecords/
SRC_PATTERN=valid.bsz-${VALID_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/lm1b-tfrecords/
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=lm1b \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--num_passes=1 \
--use_tpu=True \
${@:2}
SRC_PATTERN=test.bsz-${TEST_BSZ}.tlen-${TEST_TGT_LEN}.core-${TEST_NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/lm1b-tfrecords/
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--data_dir=${GSDATA}/lm1b-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/lm1b \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=False \
--proj_same_dim=False \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.05 \
--dropatt=0.05 \
--init_std=0.005 \
--learning_rate=0.0001 \
--warmup_steps=30000 \
--train_steps=1200000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${TRAIN_BSZ} \
--num_hosts=${NUM_HOST} \
--num_core_per_host=${NUM_CORE} \
--iterations=1000 \
--save_steps=10000 \
--use_tpu=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train.py \
--data_dir=${GSDATA}/lm1b-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/lm1b \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=False \
--proj_same_dim=False \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_host=${TEST_NUM_HOST} \
--num_core_per_host=${TEST_NUM_CORE} \
--use_tpu=True \
--do_train=False \
--do_eval_only=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Data
DATA_ROOT=../data/text8/
# Model
N_LAYER=12
D_MODEL=512
D_EMBED=512
N_HEAD=8
D_HEAD=64
D_INNER=2048
# Training
TGT_LEN=512
MEM_LEN=512
BSZ=24
NUM_CORE=4
# Testing
TEST_TGT_LEN=80
TEST_MEM_LEN=2100
TEST_CLAMP_LEN=820
TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=text8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=text8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-text8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=400000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-text8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Path
LOCAL_DIR=../data/text8/
GSDATA=
GSEXP=
# TPU setting
NUM_HOST=2
NUM_CORE=16 # TPUv2 -> 8 | TPUv3 -> 16
TEST_NUM_HOST=1
TEST_NUM_CORE=8 # TPUv2 -> 8 | TPUv3 -> 16
# Model
N_LAYER=24
D_MODEL=1024
D_EMBED=1024
N_HEAD=8
D_HEAD=128
D_INNER=3072
# Training
TGT_LEN=768
MEM_LEN=768
TRAIN_BSZ=64
VALID_BSZ=64
# Testing
TEST_TGT_LEN=128
TEST_MEM_LEN=3800
TEST_CLAMP_LEN=1000
TEST_BSZ=16
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=text8 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${TRAIN_BSZ} \
--per_host_valid_bsz=${VALID_BSZ} \
--num_core_per_host=${NUM_CORE} \
--num_passes=10 \
--use_tpu=True \
${@:2}
SRC_PATTERN=train.bsz-${TRAIN_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/text8-tfrecords/
SRC_PATTERN=valid.bsz-${VALID_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/text8-tfrecords/
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=text8 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--num_passes=1 \
--use_tpu=True \
${@:2}
SRC_PATTERN=test.bsz-${TEST_BSZ}.tlen-${TEST_TGT_LEN}.core-${TEST_NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/text8-tfrecords/
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--data_dir=${GSDATA}/text8-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/text8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.15 \
--dropatt=0.15 \
--learning_rate=0.00025 \
--warmup_steps=4000 \
--train_steps=400000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${TRAIN_BSZ} \
--use_tpu=True \
--num_host=${NUM_HOST} \
--num_core_per_host=${NUM_CORE} \
--iterations=1000 \
--save_steps=10000 \
--do_train=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train.py \
--data_dir=${GSDATA}/text8-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/text8 \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--eval_batch_size=${TEST_BSZ} \
--num_host=${TEST_NUM_HOST} \
--num_core_per_host=${TEST_NUM_CORE} \
--use_tpu=True \
--do_train=False \
--do_eval_only=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Data
DATA_ROOT=../data/wikitext-103/
# Model
DIV_VAL=1
N_LAYER=16
D_MODEL=410
D_EMBED=410
N_HEAD=10
D_HEAD=41
D_INNER=2100
# Training
TGT_LEN=150
MEM_LEN=150
BSZ=60
NUM_CORE=4
# Testing
TEST_TGT_LEN=64
TEST_MEM_LEN=640
TEST_CLAMP_LEN=400
TEST_BSZ=10
TEST_NUM_CORE=1
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=wt103 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${BSZ} \
--per_host_valid_bsz=${BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${DATA_ROOT}/ \
--dataset=wt103 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_passes=1 \
--use_tpu=False \
${@:2}
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-wt103 \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.1 \
--dropatt=0.0 \
--learning_rate=0.00025 \
--warmup_steps=0 \
--train_steps=400000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${BSZ} \
--num_core_per_host=${NUM_CORE} \
--iterations=200 \
--save_steps=4000 \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train_gpu.py \
--data_dir=${DATA_ROOT}/tfrecords \
--record_info_dir=${DATA_ROOT}/tfrecords/ \
--corpus_info_path=${DATA_ROOT}/corpus-info.json \
--model_dir=EXP-wt103 \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.0 \
--dropatt=0.0 \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--do_train=False \
--do_eval=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
# Path
LOCAL_DIR=../data/wikitext-103/
GSDATA=
GSEXP=
# TPU setting
NUM_HOST=4
NUM_CORE=16 # TPUv2 -> 8 | TPUv3 -> 16
TEST_NUM_HOST=1
TEST_NUM_CORE=8 # TPUv2 -> 8 | TPUv3 -> 16
# Model
DIV_VAL=4
N_LAYER=18
D_MODEL=1024
D_EMBED=1024
N_HEAD=16
D_HEAD=64
D_INNER=4096
# Training
TGT_LEN=384
MEM_LEN=384
TRAIN_BSZ=128
VALID_BSZ=128
# Testing
TEST_TGT_LEN=128
TEST_MEM_LEN=1600
TEST_CLAMP_LEN=1000
TEST_BSZ=8
if [[ $1 == 'train_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=wt103 \
--tgt_len=${TGT_LEN} \
--per_host_train_bsz=${TRAIN_BSZ} \
--per_host_valid_bsz=${VALID_BSZ} \
--num_core_per_host=${NUM_CORE} \
--num_passes=10 \
--use_tpu=True \
${@:2}
SRC_PATTERN=train.bsz-${TRAIN_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/wt103-tfrecords/
SRC_PATTERN=valid.bsz-${VALID_BSZ}.tlen-${TGT_LEN}.core-${NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/wt103-tfrecords/
elif [[ $1 == 'test_data' ]]; then
python data_utils.py \
--data_dir=${LOCAL_DIR}/ \
--dataset=wt103 \
--tgt_len=${TEST_TGT_LEN} \
--per_host_test_bsz=${TEST_BSZ} \
--num_core_per_host=${TEST_NUM_CORE} \
--num_passes=1 \
--use_tpu=True \
${@:2}
SRC_PATTERN=test.bsz-${TEST_BSZ}.tlen-${TEST_TGT_LEN}.core-${TEST_NUM_CORE}*
gsutil cp ${LOCAL_DIR}/tfrecords/${SRC_PATTERN} ${GSDATA}/wt103-tfrecords/
elif [[ $1 == 'train' ]]; then
echo 'Run training...'
python train.py \
--data_dir=${GSDATA}/wt103-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/wt103 \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--proj_same_dim=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--dropout=0.2 \
--dropatt=0.2 \
--init_std=0.005 \
--learning_rate=0.00025 \
--warmup_steps=16000 \
--train_steps=4000000 \
--tgt_len=${TGT_LEN} \
--mem_len=${MEM_LEN} \
--train_batch_size=${TRAIN_BSZ} \
--num_hosts=${NUM_HOST} \
--num_core_per_host=${NUM_CORE} \
--iterations=1000 \
--save_steps=10000 \
--use_tpu=True \
--do_eval=False \
${@:2}
elif [[ $1 == 'eval' ]]; then
echo 'Run evaluation...'
python train.py \
--data_dir=${GSDATA}/wt103-tfrecords \
--record_info_dir=${LOCAL_DIR}/tfrecords/ \
--corpus_info_path=${LOCAL_DIR}/corpus-info.json \
--model_dir=${GSEXP}/wt103 \
--div_val=${DIV_VAL} \
--untie_r=True \
--proj_share_all_but_first=True \
--proj_same_dim=True \
--n_layer=${N_LAYER} \
--d_model=${D_MODEL} \
--d_embed=${D_EMBED} \
--n_head=${N_HEAD} \
--d_head=${D_HEAD} \
--d_inner=${D_INNER} \
--tgt_len=${TEST_TGT_LEN} \
--mem_len=${TEST_MEM_LEN} \
--clamp_len=${TEST_CLAMP_LEN} \
--same_length=True \
--eval_batch_size=${TEST_BSZ} \
--num_host=${TEST_NUM_HOST} \
--num_core_per_host=${TEST_NUM_CORE} \
--use_tpu=True \
--do_train=False \
--do_eval_only=True \
--eval_split=test \
${@:2}
else
echo "unknown argument: $1"
fi
#!/bin/bash
URL=http://curtis.ml.cmu.edu/datasets/pretrained_xl
DATA_ROOT=./
function download () {
fileurl=${1}
filename=${fileurl##*/}
if [ ! -f ${filename} ]; then
echo ">>> Download '${filename}' from '${fileurl}'."
wget --quiet ${fileurl}
else
echo "*** File '${filename}' exists. Skip."
fi
}
cd $DATA_ROOT
mkdir -p pretrained_xl && cd pretrained_xl
# enwik8
mkdir -p tf_enwik8 && cd tf_enwik8
mkdir -p data && cd data
download ${URL}/tf_enwiki8/data/cache.pkl
download ${URL}/tf_enwiki8/data/corpus-info.json
cd ..
mkdir -p model && cd model
download ${URL}/tf_enwiki8/model/checkpoint
download ${URL}/tf_enwiki8/model/model.ckpt-0.data-00000-of-00001
download ${URL}/tf_enwiki8/model/model.ckpt-0.index
download ${URL}/tf_enwiki8/model/model.ckpt-0.meta
cd ..
cd ..
# text8
mkdir -p tf_text8 && cd tf_text8
mkdir -p data && cd data
download ${URL}/tf_text8/data/cache.pkl
download ${URL}/tf_text8/data/corpus-info.json
cd ..
mkdir -p model && cd model
download ${URL}/tf_text8/model/checkpoint
download ${URL}/tf_text8/model/model.ckpt-0.data-00000-of-00001
download ${URL}/tf_text8/model/model.ckpt-0.index
download ${URL}/tf_text8/model/model.ckpt-0.meta
cd ..
cd ..
# wt103
mkdir -p tf_wt103 && cd tf_wt103
mkdir -p data && cd data
download ${URL}/tf_wt103/data/cache.pkl
download ${URL}/tf_wt103/data/corpus-info.json
cd ..
mkdir -p model && cd model
download ${URL}/tf_wt103/model/checkpoint
download ${URL}/tf_wt103/model/model.ckpt-0.data-00000-of-00001
download ${URL}/tf_wt103/model/model.ckpt-0.index
download ${URL}/tf_wt103/model/model.ckpt-0.meta
cd ..
cd ..
# lm1b
mkdir -p tf_lm1b && cd tf_lm1b
mkdir -p data && cd data
download ${URL}/tf_lm1b/data/cache.pkl
download ${URL}/tf_lm1b/data/corpus-info.json
cd ..
mkdir -p model && cd model
download ${URL}/tf_lm1b/model/checkpoint
download ${URL}/tf_lm1b/model/model.ckpt-1191000.data-00000-of-00001
download ${URL}/tf_lm1b/model/model.ckpt-1191000.index
download ${URL}/tf_lm1b/model/model.ckpt-1191000.meta
cd ..
cd ..