tokenizer moved to its own directory

11220df8 · Mohammad · 5050203f · 11220df8 · 11220df8 · 11220df8
Commit 11220df8 authored Mar 28, 2020 by Mohammad
6 changed files
--- a/megatron/data/__init__.py
+++ b/megatron/data/__init__.py
 from . import indexed_dataset
-from .bert_tokenization import FullTokenizer as FullBertTokenizer
+

--- a/megatron/data/bert_dataset.py
+++ b/megatron/data/bert_dataset.py
@@ -24,7 +24,7 @@ from torch.utils.data import Dataset

 from megatron import mpu
 from megatron.data import helpers
-from megatron.data import FullBertTokenizer
+from megatron.tokenizer.bert_tokenization import FullTokenizer as FullBertTokenizer
 from megatron.data.dataset_utils import build_training_sample
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
 from megatron.utils import print_rank_0

--- a/megatron/global_vars.py
+++ b/megatron/global_vars.py
@@ -21,7 +21,7 @@ import time

 import torch

-from megatron.data.tokenizer import build_tokenizer
+from megatron.tokenizer import build_tokenizer
 from .arguments import parse_args

 _GLOBAL_ARGS = None

--- a/megatron/tokenizer/__init__.py
+++ b/megatron/tokenizer/__init__.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .tokenizer import build_tokenizer
--- a/megatron/data/bert_tokenization.py
+++ b/megatron/tokenizer/bert_tokenization.py
--- a/megatron/data/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py