Creates a LabelSampler for a SentenceLabelDataset.
:param examples:
a list with InputExamples
:param samples_per_label:
the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label
:param with_replacement:
if this is True, then each sample is drawn at most once (depending on the total number of samples per label).
if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same
drawing.
"""
super().__init__()
self.samples_per_label=samples_per_label
# Group examples by label
label2ex={}
forexampleinexamples:
ifexample.labelnotinlabel2ex:
label2ex[example.label]=[]
label2ex[example.label].append(example)
# Include only labels with at least 2 examples
self.grouped_inputs=[]
self.groups_right_border=[]
num_labels=0
forlabel,label_examplesinlabel2ex.items():
iflen(label_examples)>=self.samples_per_label:
self.grouped_inputs.extend(label_examples)
self.groups_right_border.append(
len(self.grouped_inputs)
)# At which position does this label group / bucket end?
num_labels+=1
self.label_range=np.arange(num_labels)
self.with_replacement=with_replacement
np.random.shuffle(self.label_range)
logger.info(
"SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(
Given a large set of sentences, this evaluator performs paraphrase (duplicate) mining and
identifies the pairs with the highest similarity. It compares the extracted paraphrase pairs
with a set of gold labels and computes the F1 score.
"""
def__init__(
self,
sentences_map:Dict[str,str],
duplicates_list:List[Tuple[str,str]]=None,
duplicates_dict:Dict[str,Dict[str,bool]]=None,
add_transitive_closure:bool=False,
query_chunk_size:int=5000,
corpus_chunk_size:int=100000,
max_pairs:int=500000,
top_k:int=100,
show_progress_bar:bool=False,
batch_size:int=16,
name:str="",
write_csv:bool=True,
):
"""
:param sentences_map: A dictionary that maps sentence-ids to sentences, i.e. sentences_map[id] => sentence.
:param duplicates_list: Duplicates_list is a list with id pairs [(id1, id2), (id1, id5)] that identifies the duplicates / paraphrases in the sentences_map
:param duplicates_dict: A default dictionary mapping [id1][id2] to true if id1 and id2 are duplicates. Must be symmetric, i.e., if [id1][id2] => True, then [id2][id1] => True.
:param add_transitive_closure: If true, it adds a transitive closure, i.e. if dup[a][b] and dup[b][c], then dup[a][c]
:param query_chunk_size: To identify the paraphrases, the cosine-similarity between all sentence-pairs will be computed. As this might require a lot of memory, we perform a batched computation. #query_batch_size sentences will be compared against up to #corpus_batch_size sentences. In the default setting, 5000 sentences will be grouped together and compared against up to 100k other sentences.
:param corpus_chunk_size: The corpus will be batched, to reduce the memory requirement
:param max_pairs: We will only extract up to #max_pairs potential paraphrase candidates.
:param top_k: For each query, we extract the top_k most similar pairs and add them to a sorted list. I.e., for one sentence we cannot find more than top_k paraphrases
:param show_progress_bar: Output a progress bar
:param batch_size: Batch size for computing sentence embeddings