"scripts/git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "ad73b2716f29e9e8672b9b90bc6cfa0f82ff6019"
Unverified Commit 2dd1b8f0 authored by Rohit Gupta's avatar Rohit Gupta Committed by GitHub
Browse files

adding key pair dataset (#19765)

parent 17d7aec8
......@@ -91,7 +91,7 @@ pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-96
dataset = datasets.load_dataset("superb", name="asr", split="test")
# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
# as we're not interested in the *target* part of the dataset.
# as we're not interested in the *target* part of the dataset. For sentence pairs, use KeyPairDataset instead.
for out in tqdm(pipe(KeyDataset(dataset, "file"))):
print(out)
# {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
......
......@@ -293,3 +293,16 @@ class KeyDataset(Dataset):
def __getitem__(self, i):
    """Return only the field stored under ``self.key`` for the item at index ``i``."""
    item = self.dataset[i]
    return item[self.key]
class KeyPairDataset(Dataset):
    """Expose two fields of each item of a wrapped dataset as a text pair.

    Indexing returns ``{"text": item[key1], "text_pair": item[key2]}``, where
    ``item`` is ``dataset[i]``. The length is that of the wrapped dataset.

    Args:
        dataset: The wrapped dataset; each item must support lookup by
            ``key1`` and ``key2``.
        key1: Key whose value is returned under ``"text"``.
        key2: Key whose value is returned under ``"text_pair"``.
    """

    def __init__(self, dataset: Dataset, key1: str, key2: str):
        self.dataset = dataset
        self.key1 = key1
        self.key2 = key2

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        # Fetch the underlying item exactly once: indexing the wrapped dataset
        # may be expensive, and two separate fetches could disagree if the
        # dataset is not deterministic.
        item = self.dataset[i]
        return {"text": item[self.key1], "text_pair": item[self.key2]}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment