"tests/vscode:/vscode.git/clone" did not exist on "84cd9e8d01adb47f046b1ee449fc76a0c32dc4e2"
Unverified commit 51448673, authored by Sylvain Gugger, committed by GitHub

Fix CI with change of name of nlp (#7054)

* nlp -> datasets

* More nlp -> datasets

* Woopsie

* More nlp -> datasets

* One last
parent e9a2f772
@@ -77,7 +77,7 @@ jobs:
- v0.3-torch_and_tf-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
+ - run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,tf-cpu,torch,testing]
- run: pip install codecov pytest-cov
- save_cache:
@@ -104,7 +104,7 @@ jobs:
- v0.3-torch-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
+ - run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,torch,testing]
- save_cache:
key: v0.3-torch-{{ checksum "setup.py" }}
@@ -129,7 +129,7 @@ jobs:
- v0.3-tf-{{ checksum "setup.py" }}
- v0.3-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
+ - run: pip install git+https://github.com/huggingface/datasets
- run: pip install .[sklearn,tf-cpu,testing]
- save_cache:
key: v0.3-tf-{{ checksum "setup.py" }}
...
@@ -46,7 +46,7 @@ jobs:
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
- pip install git+https://github.com/huggingface/nlp
+ pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
...
@@ -43,7 +43,7 @@ jobs:
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
- pip install git+https://github.com/huggingface/nlp
+ pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
...
# Long Form Question Answering
- This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗nlp](https://github.com/huggingface/nlp) libraries.
+ This folder contains the code for the Long Form Question answering [demo](http://35.226.96.115:8080/) as well as methods to train and use a fully end-to-end Long Form Question Answering system using the [🤗transformers](https://github.com/huggingface/transformers) and [🤗datasets](https://github.com/huggingface/datasets) libraries.
You can use these methods to train your own system by following along the associate [notebook](https://github.com/huggingface/notebooks/blob/master/longform-qa/Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb) or [blog post](https://yjernite.github.io/lfqa.html).
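For context, a minimal sketch of how the demo's data is loaded once `nlp` is replaced by `datasets`; the dataset names are taken verbatim from `eli5_app.py` in this diff, everything else is illustrative:

```python
# Requires `pip install datasets` (the package formerly published as `nlp`).
import datasets

# ELI5 split used by the demo (same call as in eli5_app.py below).
eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
eli5_train = eli5["train_eli5"]

# Wiki40B snippets that back the dense retrieval index.
wiki40b_passages = datasets.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]
print(eli5_train.num_rows, wiki40b_passages.num_rows)
```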
+ import datasets
import faiss
- import nlp
import numpy as np
import streamlit as st
import torch
@@ -45,7 +45,7 @@ def load_models():
def load_indexes():
if LOAD_DENSE_INDEX:
faiss_res = faiss.StandardGpuResources()
- wiki40b_passages = nlp.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]
+ wiki40b_passages = datasets.load_dataset(path="wiki_snippets", name="wiki40b_en_100_0")["train"]
wiki40b_passage_reps = np.memmap(
"wiki40b_passages_reps_32_l-8_h-768_b-512-512.dat",
dtype="float32",
@@ -63,7 +63,7 @@ def load_indexes():
@st.cache(allow_output_mutation=True)
def load_train_data():
- eli5 = nlp.load_dataset("eli5", name="LFQA_reddit")
+ eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
eli5_train = eli5["train_eli5"]
eli5_train_q_reps = np.memmap(
"eli5_questions_reps.dat", dtype="float32", mode="r", shape=(eli5_train.num_rows, 128)
...
@@ -4,8 +4,8 @@ import os # noqa: F401
from random import choice, randint
from time import time
+ import datasets # noqa: F401
import faiss # noqa: F401
- import nlp # noqa: F401
import numpy as np
import pandas as pd
import torch
...
@@ -12,7 +12,7 @@ faiss
streamlit
elasticsearch
pandas
- nlp
+ datasets
fire
pytest
conllu
\ No newline at end of file
@@ -5,25 +5,25 @@ from tqdm import tqdm
def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir=None) -> None:
- """Download a dataset using the nlp package and save it to the format expected by finetune.py
+ """Download a dataset using the datasets package and save it to the format expected by finetune.py
Format of save_dir: train.source, train.target, val.source, val.target, test.source, test.target.
Args:
src_lang: <str> source language
tgt_lang: <str> target language
- dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import nlp; print([d.id for d in nlp.list_datasets() if "wmt" in d.id])`
+ dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import datasets; print([d.id for d in datasets.list_datasets() if "wmt" in d.id])`
save_dir: <str>, where to save the datasets, defaults to f'{dataset}-{src_lang}-{tgt_lang}'
Usage:
>>> download_wmt_dataset('ro', 'en', dataset='wmt16') # saves to wmt16-ro-en
"""
try:
- import nlp
+ import datasets
except (ModuleNotFoundError, ImportError):
- raise ImportError("run pip install nlp")
+ raise ImportError("run pip install datasets")
pair = f"{src_lang}-{tgt_lang}"
print(f"Converting {dataset}-{pair}")
- ds = nlp.load_dataset(dataset, pair)
+ ds = datasets.load_dataset(dataset, pair)
if save_dir is None:
save_dir = f"{dataset}-{pair}"
save_dir = Path(save_dir)
...
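As a quick check of the updated docstring, a hedged sketch of the documented usage after the rename; it only assumes `datasets` is installed, and `download_wmt_dataset` is the helper shown in the diff above:

```python
# Mirrors the updated docstring: list the WMT datasets, then convert one pair.
import datasets

print([d.id for d in datasets.list_datasets() if "wmt" in d.id])

# Documented usage of the helper itself (unchanged apart from its internal import):
# >>> download_wmt_dataset("ro", "en", dataset="wmt16")  # saves to wmt16-ro-en
```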
@@ -7,6 +7,7 @@ known_first_party = transformers
known_third_party =
absl
conllu
+ datasets
elasticsearch
fairseq
faiss
@@ -16,7 +17,6 @@ known_third_party =
git
h5py
matplotlib
- nlp
nltk
numpy
packaging
...
@@ -83,7 +83,7 @@ from .file_utils import (
add_start_docstrings,
cached_path,
is_apex_available,
- is_nlp_available,
+ is_datasets_available,
is_psutil_available,
is_py3nvml_available,
is_tf_available,
...
@@ -66,12 +66,12 @@ except (ImportError, AssertionError):
try:
- import nlp # noqa: F401
- _nlp_available = True
+ import datasets # noqa: F401
+ _datasets_available = True
except ImportError:
- _nlp_available = False
+ _datasets_available = False
try:
from torch.hub import _get_torch_home
@@ -155,8 +155,8 @@ def is_torch_tpu_available():
return _torch_tpu_available
- def is_nlp_available():
-     return _nlp_available
+ def is_datasets_available():
+     return _datasets_available
def is_psutil_available():
...
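The renamed helper keeps the optional-dependency pattern used throughout `file_utils.py`: probe the import once at module load, record a flag, and let callers guard on it. A condensed sketch of that pattern as it reads after this change:

```python
# Optional-dependency probe, as in file_utils.py after the rename.
try:
    import datasets  # noqa: F401

    _datasets_available = True
except ImportError:
    _datasets_available = False


def is_datasets_available():
    return _datasets_available


# Downstream modules (e.g. trainer.py below) only import the package when present:
if is_datasets_available():
    import datasets
```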
@@ -20,7 +20,7 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from tqdm.auto import tqdm, trange
from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
- from .file_utils import is_nlp_available, is_torch_tpu_available
+ from .file_utils import is_datasets_available, is_torch_tpu_available
from .integrations import (
default_hp_search_backend,
is_comet_available,
@@ -65,8 +65,8 @@ else:
_use_native_amp = True
from torch.cuda.amp import autocast
- if is_nlp_available():
-     import nlp
+ if is_datasets_available():
+     import datasets
if is_torch_tpu_available():
import torch_xla.core.xla_model as xm
@@ -179,10 +179,10 @@ class Trainer:
:obj:`eval_dataset`. Will default to :func:`~transformers.default_data_collator` if no ``tokenizer`` is
provided, an instance of :func:`~transformers.DataCollatorWithPadding` otherwise.
train_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
- The dataset to use for training. If it is an :obj:`nlp.Dataset`, columns not accepted by the
+ The dataset to use for training. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
- The dataset to use for evaluation. If it is an :obj:`nlp.Dataset`, columns not accepted by the
+ The dataset to use for evaluation. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
tokenizer (:class:`PreTrainedTokenizerBase`, `optional`):
The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the
@@ -280,10 +280,10 @@ class Trainer:
FutureWarning,
)
- if is_nlp_available():
-     if isinstance(train_dataset, nlp.Dataset):
+ if is_datasets_available():
+     if isinstance(train_dataset, datasets.Dataset):
self._remove_unused_columns(self.train_dataset, description="training")
- if isinstance(eval_dataset, nlp.Dataset):
+ if isinstance(eval_dataset, datasets.Dataset):
self._remove_unused_columns(self.eval_dataset, description="evaluation")
self.global_step = None
@@ -294,7 +294,7 @@ class Trainer:
self.hp_search_backend = None
self.use_tune_checkpoints = False
- def _remove_unused_columns(self, dataset: "nlp.Dataset", description: Optional[str] = None):
+ def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
if not self.args.remove_unused_columns:
return
# Inspect model forward signature to keep only the arguments it accepts.
@@ -364,12 +364,12 @@ class Trainer:
Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
- If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`, columns not
+ If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, columns not
accepted by the ``model.forward()`` method are automatically removed.
"""
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
- elif eval_dataset is not None and is_nlp_available() and isinstance(eval_dataset, nlp.Dataset):
+ elif eval_dataset is not None and is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
self._remove_unused_columns(eval_dataset, description="evaluation")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
eval_sampler = self._get_eval_sampler(eval_dataset)
@@ -393,10 +393,10 @@ class Trainer:
Args:
eval_dataset (:obj:`torch.utils.data.dataset.Dataset`, `optional`):
- The test dataset to use. If it is an :obj:`nlp.Dataset`, columns not accepted by the
+ The test dataset to use. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
"""
- if is_nlp_available() and isinstance(test_dataset, nlp.Dataset):
+ if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
self._remove_unused_columns(test_dataset, description="test")
test_sampler = self._get_eval_sampler(test_dataset)
@@ -1200,7 +1200,7 @@ class Trainer:
Args:
eval_dataset (:obj:`Dataset`, `optional`):
- Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`nlp.Dataset`,
+ Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
columns not accepted by the ``model.forward()`` method are automatically removed.
Returns:
@@ -1227,7 +1227,7 @@ class Trainer:
Args:
test_dataset (:obj:`Dataset`):
- Dataset to run the predictions on. If it is an :obj:`nlp.Dataset`, columns not accepted by the
+ Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the
``model.forward()`` method are automatically removed.
Returns:
...
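The docstring changes above describe the behaviour exercised by the test below: when a `datasets.Dataset` is passed to `Trainer`, columns that `model.forward()` does not accept are dropped automatically. A minimal, self-contained sketch of that behaviour; the tiny regression module is illustrative only (the real test defines its own `RegressionModel`), and the output directory name is arbitrary:

```python
import datasets
import numpy as np
import torch
from transformers import Trainer, TrainingArguments


class TinyRegression(torch.nn.Module):
    """Illustrative stand-in for the test's RegressionModel: forward() only
    accepts `input_x` and `labels`, so any other dataset column is unused."""

    def __init__(self):
        super().__init__()
        self.a = torch.nn.Parameter(torch.zeros(1))
        self.b = torch.nn.Parameter(torch.zeros(1))

    def forward(self, input_x=None, labels=None):
        y_pred = input_x * self.a + self.b
        if labels is None:
            return (y_pred,)
        return (torch.nn.functional.mse_loss(y_pred, labels), y_pred)


x = np.random.normal(size=(64,)).astype(np.float32)
y = (2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))).astype(np.float32)
z = np.random.normal(size=(64,)).astype(np.float32)

# "extra" is not an argument of forward(), so Trainer removes that column
# automatically now that it checks for datasets.Dataset instead of nlp.Dataset.
train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})

args = TrainingArguments(output_dir="tiny_regression_output")  # arbitrary path
trainer = Trainer(model=TinyRegression(), args=args, train_dataset=train_dataset)
trainer.train()
```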
import unittest
- import nlp
+ import datasets
import numpy as np
from transformers import AutoTokenizer, TrainingArguments, is_torch_available
@@ -200,11 +200,11 @@ class TrainerIntegrationTest(unittest.TestCase):
x = trainer.eval_dataset.x
self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))
- def test_trainer_with_nlp(self):
+ def test_trainer_with_datasets(self):
np.random.seed(42)
x = np.random.normal(size=(64,)).astype(np.float32)
y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,))
- train_dataset = nlp.Dataset.from_dict({"input_x": x, "label": y})
+ train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})
# Base training. Should have the same results as test_reproducible_training
model = RegressionModel()
@@ -222,7 +222,7 @@ class TrainerIntegrationTest(unittest.TestCase):
# Adding one column not used by the model should have no impact
z = np.random.normal(size=(64,)).astype(np.float32)
- train_dataset = nlp.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
+ train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
model = RegressionModel()
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.train()
...