Unverified Commit e4c06ed6 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

New run_seq2seq script (#9605)



* New run_seq2seq script

* Add tests

* Mark as slow

* Update examples/seq2seq/run_seq2seq.py
Co-authored-by: default avatarPatrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/data/data_collator.py
Co-authored-by: default avatarSuraj Patil <surajp815@gmail.com>

* Update src/transformers/data/data_collator.py
Co-authored-by: default avatarSuraj Patil <surajp815@gmail.com>

* Address review comments
Co-authored-by: default avatarPatrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: default avatarSuraj Patil <surajp815@gmail.com>
parent fa876aee
This diff is collapsed.
...@@ -23,7 +23,7 @@ from unittest.mock import patch ...@@ -23,7 +23,7 @@ from unittest.mock import patch
import torch import torch
from transformers.file_utils import is_apex_available from transformers.file_utils import is_apex_available
from transformers.testing_utils import TestCasePlus, require_torch_non_multi_gpu_but_fix_me, torch_device from transformers.testing_utils import TestCasePlus, require_torch_non_multi_gpu_but_fix_me, slow, torch_device
SRC_DIRS = [ SRC_DIRS = [
...@@ -35,6 +35,7 @@ SRC_DIRS = [ ...@@ -35,6 +35,7 @@ SRC_DIRS = [
"language-modeling", "language-modeling",
"multiple-choice", "multiple-choice",
"question-answering", "question-answering",
"seq2seq",
] ]
] ]
sys.path.extend(SRC_DIRS) sys.path.extend(SRC_DIRS)
...@@ -47,6 +48,7 @@ if SRC_DIRS is not None: ...@@ -47,6 +48,7 @@ if SRC_DIRS is not None:
import run_mlm import run_mlm
import run_ner import run_ner
import run_qa as run_squad import run_qa as run_squad
import run_seq2seq
import run_swag import run_swag
...@@ -259,3 +261,67 @@ class ExamplesTests(TestCasePlus): ...@@ -259,3 +261,67 @@ class ExamplesTests(TestCasePlus):
with patch.object(sys, "argv", testargs + [model_type, model_name]): with patch.object(sys, "argv", testargs + [model_type, model_name]):
result = run_generation.main() result = run_generation.main()
self.assertGreaterEqual(len(result[0]), 10) self.assertGreaterEqual(len(result[0]), 10)
@slow
@require_torch_non_multi_gpu_but_fix_me
def test_run_seq2seq_summarization(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_seq2seq.py
--model_name_or_path t5-small
--task summarization
--train_file tests/fixtures/tests_samples/xsum/sample.json
--validation_file tests/fixtures/tests_samples/xsum/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=50
--warmup_steps=8
--do_train
--do_eval
--learning_rate=2e-4
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
--predict_with_generate
""".split()
with patch.object(sys, "argv", testargs):
result = run_seq2seq.main()
self.assertGreaterEqual(result["eval_rouge1"], 10)
self.assertGreaterEqual(result["eval_rouge2"], 2)
self.assertGreaterEqual(result["eval_rougeL"], 7)
self.assertGreaterEqual(result["eval_rougeLsum"], 7)
@slow
@require_torch_non_multi_gpu_but_fix_me
def test_run_seq2seq_translation(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_seq2seq.py
--model_name_or_path sshleifer/student_marian_en_ro_6_1
--task translation_en_to_ro
--train_file tests/fixtures/tests_samples/wmt16/sample.json
--validation_file tests/fixtures/tests_samples/wmt16/sample.json
--output_dir {tmp_dir}
--overwrite_output_dir
--max_steps=50
--warmup_steps=8
--do_train
--do_eval
--learning_rate=3e-3
--per_device_train_batch_size=2
--per_device_eval_batch_size=1
--predict_with_generate
--source_lang en_XX
--target_lang ro_RO
""".split()
with patch.object(sys, "argv", testargs):
result = run_seq2seq.main()
self.assertGreaterEqual(result["eval_bleu"], 30)
...@@ -324,6 +324,7 @@ if is_torch_available(): ...@@ -324,6 +324,7 @@ if is_torch_available():
"DataCollator", "DataCollator",
"DataCollatorForLanguageModeling", "DataCollatorForLanguageModeling",
"DataCollatorForPermutationLanguageModeling", "DataCollatorForPermutationLanguageModeling",
"DataCollatorForSeq2Seq",
"DataCollatorForSOP", "DataCollatorForSOP",
"DataCollatorForTokenClassification", "DataCollatorForTokenClassification",
"DataCollatorForWholeWordMask", "DataCollatorForWholeWordMask",
...@@ -1395,6 +1396,7 @@ if TYPE_CHECKING: ...@@ -1395,6 +1396,7 @@ if TYPE_CHECKING:
DataCollator, DataCollator,
DataCollatorForLanguageModeling, DataCollatorForLanguageModeling,
DataCollatorForPermutationLanguageModeling, DataCollatorForPermutationLanguageModeling,
DataCollatorForSeq2Seq,
DataCollatorForSOP, DataCollatorForSOP,
DataCollatorForTokenClassification, DataCollatorForTokenClassification,
DataCollatorForWholeWordMask, DataCollatorForWholeWordMask,
......
...@@ -224,6 +224,63 @@ def tolist(x: Union[List[Any], torch.Tensor]): ...@@ -224,6 +224,63 @@ def tolist(x: Union[List[Any], torch.Tensor]):
return x.tolist() if isinstance(x, torch.Tensor) else x return x.tolist() if isinstance(x, torch.Tensor) else x
@dataclass
class DataCollatorForSeq2Seq:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.
Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
"""
tokenizer: PreTrainedTokenizerBase
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
def __call__(self, features):
labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
if labels is not None:
max_label_length = max(len(l) for l in labels)
padding_side = self.tokenizer.padding_side
for feature in features:
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
return self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt",
)
@dataclass @dataclass
class DataCollatorForLanguageModeling: class DataCollatorForLanguageModeling:
""" """
......
...@@ -35,6 +35,11 @@ class DataCollatorForPermutationLanguageModeling: ...@@ -35,6 +35,11 @@ class DataCollatorForPermutationLanguageModeling:
requires_pytorch(self) requires_pytorch(self)
class DataCollatorForSeq2Seq:
def __init__(self, *args, **kwargs):
requires_pytorch(self)
class DataCollatorForSOP: class DataCollatorForSOP:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
requires_pytorch(self) requires_pytorch(self)
......
{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}}
{"translation": {"en": "Approval of Minutes of previous sitting: see Minutes", "ro": "Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal"}}
{"translation": {"en": "Membership of Parliament: see Minutes", "ro": "Componenţa Parlamentului: a se vedea procesul-verbal"}}
{"translation": {"en": "Verification of credentials: see Minutes", "ro": "Verificarea prerogativelor: a se vedea procesul-verbal"}}
{"translation": {"en": "Documents received: see Minutes", "ro": "Depunere de documente: a se vedea procesul-verbal"}}
{"translation": {"en": "Written statements and oral questions (tabling): see Minutes", "ro": "Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal"}}
{"translation": {"en": "Petitions: see Minutes", "ro": "Petiţii: a se vedea procesul-verbal"}}
{"translation": {"en": "Texts of agreements forwarded by the Council: see Minutes", "ro": "Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal"}}
{"translation": {"en": "Action taken on Parliament's resolutions: see Minutes", "ro": "Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal"}}
{"translation": {"en": "Agenda for next sitting: see Minutes", "ro": "Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal"}}
{"document": "The warning begins at 22:00 GMT on Saturday and ends at 10:00 on Sunday.\nThe ice could lead to difficult driving conditions on untreated roads and slippery conditions on pavements, the weather service warned.\nOnly the southernmost counties and parts of the most westerly counties are expected to escape.\nCounties expected to be affected are Carmarthenshire, Powys, Ceredigion, Pembrokeshire, Denbighshire, Gwynedd, Wrexham, Conwy, Flintshire, Anglesey, Monmouthshire, Blaenau Gwent, Caerphilly, Merthyr Tydfil, Neath Port Talbot, Rhondda Cynon Taff and Torfaen.", "summary": "The Met Office has issued a yellow weather warning for ice across most of Wales."}
{"document": "You can see highlights of Sunderland v Arsenal on Match of the Day at 22:20 BST on Saturday on BBC One and the BBC Sport website.\nStoke and West Ham, for example, have started to climb away from the relegation zone but the biggest worry for Sunderland fans is that their side do not look remotely capable of doing the same.\nI know the Black Cats have got out of trouble before having found themselves in a similar situation but this time, after picking up only two points from their first nine games, things look really desperate for the only top-flight team without a win.\nAt least one element of their struggles seems to be self-inflicted, with everyone at the club feeling sorry for themselves - and not just because they have lost some players to injury and conceded some costly late goals.\nThere is a negative feeling about the place with the manager David Moyes and his players talking about how they have gone backwards since last season, when they should be searching for any kind of spark that could change things around.\nFrom the outside, looking at the way they play and their lack of creativity, it is hard to see what that spark might be or what could fundamentally change under Moyes until the January transfer window opens.\nIf they can get one win under their belt then they will get a bit of belief back but, the longer this winless run goes on, the more negativity there will be.\nMedia playback is not supported on this device\nSunderland finished last season on a high under Sam Allardyce, with a run of just one defeat in their last 11 games securing their safety.\nIn the space of five months, all of that confidence and momentum seems to have been sucked out of the club, despite them effectively having the same group of players who, not so long ago, looked inspired.\nThat is not all down to Moyes, but he has to take some responsibility for it.\nI am yet to see a defined style of play from Sunderland since he took charge at the end of July.\nThat is in contrast to Allardyce's time as manager, when they were resolute and difficult to beat and, at the end of his stint at the Stadium of Light, also played with a purpose when they went forward.\nOff the pitch, Moyes has not helped himself much either.\nThere was no need for him to be so pessimistic when he came out after the second game of the season and announced they would be in a relegation fight, which did not send out the right message to his players or the fans.\nWhen he took charge, he had actually started out by being unrealistically positive - talking about Sunderland becoming a club that regularly finished in the top half of the Premier League - but his expectations went downhill very quickly.\nI know you can argue that he has been proved right, because Sunderland are now battling the drop, but it meant there was a cloud over from them almost as soon as the season had started.\nIt seems to be a case that if you stop Jermain Defoe, you stop Sunderland. His statistics stand up well in comparison to last season, but the rest of their team are not doing enough in attack.\nThey were reliant on Defoe last season too, but others did chip in - in their first nine league games of 2015-16, five players found the net. This time around, only Defoe and Patrick van Aanholt have scored in the same period.\nIt is going to be a massive struggle for them to stay up from the position they are now in anyway, but they badly need a win and quickly. I don't see it coming at home to Arsenal on Saturday, though.\nDo they even look capable of holding out for a draw against the Gunners, the way another struggling team Middlesbrough did at Emirates Stadium last weekend? No.\nIf you struggle to make chances and score goals, as Sunderland do, that puts more pressure on your defence because you know if you concede then you are in big trouble.\nAnd the Black Cats have problems at the back as well - their only clean sheet in 12 matches under Moyes was against League One side Shrewsbury Town in the EFL Cup.\nIt does not bode well against an Arsenal side that are averaging more than two goals a game this season.\nIt is hard to find any positives from Sunderland's situation but at least they have not been cut adrift at the bottom - yet.\nUnless they win soon, that could happen. I think Hull are also in for a very tough season but when I look at the other two teams immediately above them, Boro and Swansea, they definitely have more about them than the Black Cats do.\nMedia playback is not supported on this device\nChanging manager has clearly not helped Sunderland and comparisons with his predecessor do not help Moyes much either.\nYou cannot tell me that, if Allardyce was still in charge, Sunderland would have only picked up two points so far. It just would not have happened.\nMoyes replaced him relatively late in the summer, which is difficult in itself, but he can only complain about the things that have gone against him up to a point. He should be doing much better than he is.\nHe is still the manager and he is capable of turning things around, so it is right there is no suggestion of him getting the sack.\nBut that will not last forever. This industry is results-driven and Moyes' results are not good enough.\nThat clearly has to change soon and, looking at Sunderland's next few fixtures, the one that stands out as a must-win is their home game against Hull on 19 November.\nIf they fail to beat Arsenal and Bournemouth, then the visit of the Tigers will be the game to define Moyes' tenure. If Sunderland are still without a win after that, things will become extremely difficult for him.\nChris Sutton was speaking to BBC Sport's Chris Bevan.", "summary": "We are exactly a quarter of the way through the Premier League season and some teams at the bottom of the table seem to be turning things around after making a bad start."}
{"document": "The win keeps the Candystripes two points behind leaders Dundalk who won 2-0 away to Shamrock Rovers.\nFormer Plymouth striker Patterson scored his sixth goal of the season in the 14th minute at the Brandywell.\nHe shot into an empty net after the ball broke to him when keeper Dean Delany thwarted Barry McNamee.\nKurtis Byrne should have netted a speedy equaliser but the son of former Celtic player Paul Byrne completely missed his kick in front of goal.\nThat was the one big scare for Kenny Shiels' men on a night when both keepers had a quiet night.\nDerry City have won six and drawn two in the eight games they have played since losing to Finn Harps on the first day of the season.", "summary": "Rory Patterson's early goal proved enough to give second-placed Derry City a home victory over Bohemians in Friday night's Premier Division clash."}
{"document": "The centre-right coalition led by Mr Passos Coelho won the most seats in the election on 4 October.\nBut Socialist leader Antonio Costa has been working to build a coalition with far-left parties.\nMany believe that Mr Passos Coelho will fail to pass the test of a vote of no confidence in Portugal's parliament.\nPresident Anibal Cavaco Silva would then be expected to ask the left to form a government.\nThere are fears that weeks of uncertainty could harm Portugal's economic recovery, more than a year after it exited the strict terms of its €78bn (£57bn) international bailout.\nEU officials have threatened to take action against Portugal for missing a 15 October deadline to present its draft 2016 budget.\nPortugal is still running one of the highest budget deficits in the eurozone.\n12%\nof the workforce is unemployed\n20%\nof people live below the poverty line\n485,000 emigrated from Portugal between 2011 and 2014\n125% debt to GDP - the second highest rate in the European Union\nMr Passos Coelho's Social Democrats have promised to present a budget, but the two left-wing parties campaigned strongly against his outgoing government's record of harsh austerity.\nThe Left Bloc is seen as allied to the anti-austerity Syriza party in Greece, which for months tried to renegotiate the terms of Greece's eurozone bailout.\nPortugal's Communist Party is regarded as anti-euro and anti-Nato, although it is thought to have moderated its eurozone policies in recent weeks.\nIf Mr Costa's Socialists are eventually chosen to lead a left-wing coalition, it would be the first time since the fall of Portugal's dictatorship in 1974 that a right-wing president appointed a government backed by communists.\nAfter his re-appointment as prime minister leading a right-of-centre coalition, Pedro Passos Coelho has 10 days to appoint ministers and secure parliamentary approval.\nThat may prove impossible, since his coalition lost its majority in the 4 October election and the Socialists have pledged to reject his programme if their talks with other parties succeed.\nTogether, the Socialists, Left Bloc and Communist Party have a majority. All wanted the president to appoint Mr Costa - arguing that anything else was a waste of time.\nIf Mr Passos Coelho does fail, the president could then appoint Mr Costa or keep the incumbent on as caretaker.\nFresh legislative elections may only take place from June, after voters have elected a new president early next year.", "summary": "The Portuguese president has invited incumbent Prime Minister Pedro Passos Coelho to form the next government, despite him having lost his majority."}
{"document": "Nev Edwards scored an early try for Sale, before Castres' Florian Vialelle went over, but Julien Dumora's penalty put the hosts 10-7 ahead at the break.\nJoe Ford sent over a penalty before Castres' Marc-Antoine Rallier and Sales' Will Addison were sin-binned.\nJulien Caminati's late attempt to stop Charlie Ingall saw Sale awarded the decisive penalty try.\nThe win moves the English Premiership side to within one point of Pool Two leaders Newport Gwent Dragons after three games.\nSale got off to the ideal start, Edwards sprinting away for the game's opening points from an Andrei Ostrikov kick, but Castres heaped the pressure on in search of a reply, which came through Vialelle on eight minutes.\nSharks flanker Magnus Lund was forced off with a head injury before the television match official denied Castres a second try, with replays showing that the Sharks defence did enough to force full-back Caminati into touch.\nFord had a chance to put Sale ahead again, but his penalty on 27 minutes drifted wide. Dumora, however, made no mistake soon after, slotting over to give the French side the lead on 33 minutes.\nA combination of probing grubber kicks and scrappy play eventually led to Ford teeing up his second penalty attempt, with the fly-half this time booting the three points to make it 10-10.\nRallier's yellow card following a scuffle saw Ford opt for the posts soon after, but he was off target again before Sales' one-man advantage was lost as Addison was sin-binned.\nSharks pushed for the breakthrough as Ingall went close to touching down, and the video referee eventually gave the penalty try after deciding that Caminati's attempt to stop the winger was illegal.\nCastres: Caminati; Martial, Vialelle, Combezou, Decrop; Dumora, Dupont; Taumoepeau, Rallier, Montes; Samson, Moreaux, Caballero, Diarra, Beattie.\nReplacements: Beziat, Tichit, Martinez, Desroche, Babillot, Fontaine, Lamerat, Seron.\nSale: Arscott; Edwards, Addison, Jennings, Ingall; Ford, Mitchell, Lewis-Roberts, Briggs, Mujati, Mills, Ostrikov, Lund, Seymour (capt), Easter.\nReplacements: Taylor, Flynn, Parker, Beaumont, Neild, Jeffers, James, Haley.\nReferee: David Wilkinson (Ireland)", "summary": "A late penalty try gave Sale victory over Castres at Stade Pierre-Antoine in their European Challenge Cup clash."}
{"document": "The 33-year-old was released by Norwich this summer after five years at the club, during which time he made 75 Canaries first-team appearances.\nTurner also had spells on loan at Fulham and Sheffield Wednesday during his time at Carrow Road.\nIn total, the centre-back has made 436 senior career appearances for eight different clubs.\nFind all the latest football transfers on our dedicated page.", "summary": "League One side Southend United have signed former Hull and Norwich defender Michael Turner on a one-year deal."}
{"document": "United contacted St Johnstone this week with a view to speaking to 52-year-old Wright about the job but this approach was rejected by the Saints board.\nThe Tannadice club - bottom of the Premiership - are seeking to replace Jackie McNamara, who left last month.\nDave Bowman took the first team for Saturday's loss to Partick Thistle.\nThe Tangerines have won only once this season and prop up the table with five points from 10 games.\nFormer Northern Ireland goalkeeper Wright, who replaced Steve Lomas at McDiarmid Park in 2013, led St Johnstone to Scottish Cup success in his first season in charge.\nHe has also secured two successive top-six finishes for the Perth side and previously managed in his homeland.", "summary": "St Johnstone boss Tommy Wright is no longer under consideration for the Dundee United manager's job, BBC Scotland has learned."}
{"document": "Media playback is unsupported on your device\n2 November 2014 Last updated at 17:20 GMT\nHomes and businesses were damaged in the storm, but weather experts were not able to confirm it was a tornado.\nNavtej Johal reports.", "summary": "Residents in Coalville in Leicestershire are cleaning up after high winds hit the town."}
{"document": "5 August 2015 Last updated at 06:36 BST\nShe's now 84 and has been telling Newsround the inspiring story of her life before and after that devastating and world-changing event.\nThis animation contains some sad moments that you might find upsetting.\nYou can find out more about what happened in Hiroshima here.\nWatch 'Hiroshima: A Newsround Special' - Thursday 6 August at 5.30pm on the CBBC channel and on the Newsround website.", "summary": "Bun Hashizume was 14 years old and lived in Hiroshima, in Japan, when a nuclear bomb was dropped on the city 70 years ago, at the end of World War Two."}
{"document": "But what has been your moment of the year?\nFrom Ben Stokes' 258 off 198 balls against South Africa to Stuart Broad's 6-17 against the same opponents, and Alastair Cook being the first Englishman to reach 10,000 Test runs, there are lots of highlights.\nOr perhaps you revelled in Australia being skittled for just 85? Or the dog that invaded the pitch at Vizag?\nThe cricket brains of BBC Sport and BBC Radio 5 live asked you to rank your top 10, and your shortlist will be revealed on Tuesday's Tuffers and Vaughan Cricket Show (20:30 GMT, BBC Radio 5 live and online).\nVotes will no longer count but you can still pick your top 10 and share with friends.\nWhat are your top 10 cricketing moments from this year?", "summary": "It's been topsy-turvy for the England side but eventful and entertaining nonetheless."}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment