Commit 4d4d8f59 authored by chenzk

v1.0
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Dict, List, Optional, Union
import pandas as pd
from datasets import Dataset
from distilabel.errors import DistilabelUserError
from distilabel.steps.base import StepResources
if TYPE_CHECKING:
from distilabel.pipeline.base import BasePipeline
from distilabel.steps import GeneratorStep
def make_generator_step(
dataset: Union[Dataset, pd.DataFrame, List[Dict[str, str]]],
pipeline: Union["BasePipeline", None] = None,
batch_size: int = 50,
input_mappings: Optional[Dict[str, str]] = None,
output_mappings: Optional[Dict[str, str]] = None,
resources: StepResources = StepResources(),
repo_id: Optional[str] = "default_name",
) -> "GeneratorStep":
"""Helper method to create a `GeneratorStep` from a dataset, to simplify
Args:
dataset: The dataset to use in the `Pipeline`.
pipeline: The `Pipeline` the generator step will be added to, if any. Defaults to `None`.
batch_size: The batch size for the generator step; it defaults to the same value used by
the `GeneratorStep`s, i.e. `50`.
input_mappings: Applies the same as any other step. Defaults to `None`.
output_mappings: Applies the same as any other step. Defaults to `None`.
resources: Applies the same as any other step. Defaults to `StepResources()`.
repo_id: The repository ID to use in the `LoadDataFromHub` step.
This shouldn't be necessary in normal use, but if an error occurs the step will fall back
to loading the dataset internally using `load_dataset`, in which case this `repo_id` is used.
Raises:
ValueError: If the format is different from the ones supported.
Returns:
A `LoadDataFromDicts` instance if the input is a list of dicts, or a `LoadDataFromHub` instance
if the input is a `pd.DataFrame` or a `Dataset`.
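Examples:
A minimal sketch of wrapping an in-memory list of dicts (the `instruction` column and its
value below are illustrative placeholders):

```python
loader = make_generator_step(
    [{"instruction": "Tell me a joke."}],
    batch_size=10,
)
# `loader` is a `LoadDataFromDicts` instance that can be used as the first
# step of a `Pipeline`.
```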
"""
from distilabel.steps import LoadDataFromDicts, LoadDataFromHub
if isinstance(dataset, list):
return LoadDataFromDicts(
pipeline=pipeline,
data=dataset,
batch_size=batch_size,
input_mappings=input_mappings or {},
output_mappings=output_mappings or {},
resources=resources,
)
if isinstance(dataset, pd.DataFrame):
dataset = Dataset.from_pandas(dataset, preserve_index=False)
if not isinstance(dataset, Dataset):
raise DistilabelUserError(
f"Dataset type not allowed: {type(dataset)}, must be one of: "
"`datasets.Dataset`, `pd.DataFrame`, `List[Dict[str, str]]`",
page="sections/how_to_guides/basic/pipeline/?h=make_#__tabbed_1_2",
)
loader = LoadDataFromHub(
pipeline=pipeline,
repo_id=repo_id,
batch_size=batch_size,
input_mappings=input_mappings or {},
output_mappings=output_mappings or {},
resources=resources,
)
super(loader.__class__, loader).load() # Ensure the logger is loaded
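# Inject the already in-memory dataset into the `LoadDataFromHub` step so nothing is
# downloaded from the Hub; `repo_id` is only kept as a fallback identifier.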
loader._dataset = dataset
loader.num_examples = len(dataset)
loader._dataset_info = {"default": dataset.info}
return loader
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from collections import defaultdict
from typing import TYPE_CHECKING, Optional
from datasets import Dataset
from pydantic import Field
from distilabel.mixins.runtime_parameters import RuntimeParameter
from distilabel.steps.base import GlobalStep, StepInput
if TYPE_CHECKING:
from distilabel.typing import StepOutput
class PushToHub(GlobalStep):
"""Push data to a Hugging Face Hub dataset.
A `GlobalStep` which creates a `datasets.Dataset` with the input data and pushes
it to the Hugging Face Hub.
Attributes:
repo_id: The Hugging Face Hub repository ID where the dataset will be uploaded.
split: The split of the dataset that will be pushed. Defaults to `"train"`.
private: Whether the dataset to be pushed should be private or not. Defaults to
`False`.
token: The token used to authenticate in the Hub. If not provided, it will be read
from the `HF_TOKEN` environment variable; failing that, the `huggingface_hub` library
will fall back to the token stored by the local Hugging Face CLI configuration.
Defaults to `None`.
Runtime parameters:
- `repo_id`: The Hugging Face Hub repository ID where the dataset will be uploaded.
- `split`: The split of the dataset that will be pushed.
- `private`: Whether the dataset to be pushed should be private or not.
- `token`: The token that will be used to authenticate in the Hub.
Input columns:
- dynamic (`all`): all columns from the input will be used to create the dataset.
Categories:
- save
- dataset
- huggingface
Examples:
Push batches of your dataset to the Hugging Face Hub repository:
```python
from distilabel.steps import PushToHub
push = PushToHub(repo_id="path_to/repo")
push.load()
result = next(
push.process(
[
{
"instruction": "instruction ",
"generation": "generation"
}
],
)
)
# >>> result
# [{'instruction': 'instruction ', 'generation': 'generation'}]
```
"""
repo_id: RuntimeParameter[str] = Field(
default=None,
description="The Hugging Face Hub repository ID where the dataset will be uploaded.",
)
split: RuntimeParameter[str] = Field(
default="train",
description="The split of the dataset that will be pushed. Defaults to 'train'.",
)
private: RuntimeParameter[bool] = Field(
default=False,
description="Whether the dataset to be pushed should be private or not. Defaults"
" to `False`.",
)
token: Optional[RuntimeParameter[str]] = Field(
default=None,
description="The token that will be used to authenticate in the Hub. If not provided,"
" the token will be tried to be obtained from the environment variable `HF_TOKEN`."
" If not provided using one of the previous methods, then `huggingface_hub` library"
" will try to use the token from the local Hugging Face CLI configuration. Defaults"
" to `None`",
)
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
"""Method that processes the input data, respecting the `datasets.Dataset` formatting,
and pushes it to the Hugging Face Hub based on the `RuntimeParameter`s attributes.
Args:
inputs: the whole input data gathered in a single batch (as this is a `GlobalStep`),
which will be transformed into a `datasets.Dataset`.
Yields:
Propagates the received inputs so that the `Distiset` can be generated if this is
the last step of the `Pipeline`, or if this is not a leaf step and has follow up
steps.
"""
dataset_dict = defaultdict(list)
for input in inputs:
for key, value in input.items():
dataset_dict[key].append(value)
dataset_dict = dict(dataset_dict)
dataset = Dataset.from_dict(dataset_dict)
dataset.push_to_hub(
self.repo_id, # type: ignore
split=self.split,
private=self.private,
token=self.token or os.getenv("HF_TOKEN"),
)
yield inputs
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import TYPE_CHECKING, Any, Dict, List, Union
from pydantic import Field, PrivateAttr, SecretStr
from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin
from distilabel.steps.base import Step, StepInput
from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR
if TYPE_CHECKING:
import torch
from transformers import PreTrainedModel, PreTrainedTokenizer
from distilabel.typing import ChatType, StepColumns, StepOutput
class RewardModelScore(Step, CudaDevicePlacementMixin):
"""Assign a score to a response using a Reward Model.
`RewardModelScore` is a `Step` that, using a Reward Model (RM) loaded with `transformers`,
assigns a score to a response generated for an instruction, or to a multi-turn
conversation.
Attributes:
model: the model Hugging Face Hub repo id or a path to a directory containing the
model weights and configuration files.
revision: if `model` refers to a Hugging Face Hub repository, then the revision
(e.g. a branch name or a commit id) to use. Defaults to `"main"`.
torch_dtype: the torch dtype to use for the model e.g. "float16", "float32", etc.
Defaults to `"auto"`.
trust_remote_code: whether to allow fetching and executing remote code fetched
from the repository in the Hub. Defaults to `False`.
device_map: a dictionary mapping each layer of the model to a device, or a mode like `"sequential"` or `"auto"`. Defaults to `None`.
token: the Hugging Face Hub token that will be used to authenticate to the Hugging
Face Hub. If not provided, the `HF_TOKEN` environment or `huggingface_hub` package
local configuration will be used. Defaults to `None`.
truncation: whether to truncate sequences at the maximum length. Defaults to `False`.
max_length: maximum length to use for padding or truncation. Defaults to `None`.
Input columns:
- instruction (`str`, optional): the instruction used to generate a `response`.
If provided, then `response` must be provided too.
- response (`str`, optional): the response generated for `instruction`. If provided,
then `instruction` must be provided too.
- conversation (`ChatType`, optional): a multi-turn conversation. If not provided,
then `instruction` and `response` columns must be provided.
Output columns:
- score (`float`): the score given by the reward model for the instruction-response
pair or the conversation.
Categories:
- scorer
Examples:
Assigning a score to an instruction-response pair:
```python
from distilabel.steps import RewardModelScore
step = RewardModelScore(
model="RLHFlow/ArmoRM-Llama3-8B-v0.1", device_map="auto", trust_remote_code=True
)
step.load()
result = next(
step.process(
inputs=[
{
"instruction": "How much is 2+2?",
"response": "The output of 2+2 is 4",
},
{"instruction": "How much is 2+2?", "response": "4"},
]
)
)
# [
# {'instruction': 'How much is 2+2?', 'response': 'The output of 2+2 is 4', 'score': 0.11690367758274078},
# {'instruction': 'How much is 2+2?', 'response': '4', 'score': 0.10300665348768234}
# ]
```
Assigning a score to a multi-turn conversation:
```python
from distilabel.steps import RewardModelScore
step = RewardModelScore(
model="RLHFlow/ArmoRM-Llama3-8B-v0.1", device_map="auto", trust_remote_code=True
)
step.load()
result = next(
step.process(
inputs=[
{
"conversation": [
{"role": "user", "content": "How much is 2+2?"},
{"role": "assistant", "content": "The output of 2+2 is 4"},
],
},
{
"conversation": [
{"role": "user", "content": "How much is 2+2?"},
{"role": "assistant", "content": "4"},
],
},
]
)
)
# [
# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': 'The output of 2+2 is 4'}], 'score': 0.11690367758274078},
# {'conversation': [{'role': 'user', 'content': 'How much is 2+2?'}, {'role': 'assistant', 'content': '4'}], 'score': 0.10300665348768234}
# ]
```
"""
model: str
revision: str = "main"
torch_dtype: str = "auto"
trust_remote_code: bool = False
device_map: Union[str, Dict[str, Any], None] = None
token: Union[SecretStr, None] = Field(
default_factory=lambda: os.getenv(HF_TOKEN_ENV_VAR),
description="The Hugging Face Hub token used to authenticate to the Hugging Face Hub."
" If not provided, the `HF_TOKEN` environment variable or the local `huggingface_hub`"
" configuration will be used. Defaults to `None`.",
)
truncation: bool = False
max_length: Union[int, None] = None
_model: Union["PreTrainedModel", None] = PrivateAttr(None)
_tokenizer: Union["PreTrainedTokenizer", None] = PrivateAttr(None)
def load(self) -> None:
super().load()
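# Only configure CUDA device placement when the model may be placed on a GPU.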
if self.device_map in ["cuda", "auto"]:
CudaDevicePlacementMixin.load(self)
try:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
except ImportError as e:
raise ImportError(
"`transformers` is not installed. Please install it using `pip install 'distilabel[hf-transformers]'`."
) from e
token = self.token.get_secret_value() if self.token is not None else self.token
self._model = AutoModelForSequenceClassification.from_pretrained(
self.model,
revision=self.revision,
torch_dtype=self.torch_dtype,
trust_remote_code=self.trust_remote_code,
device_map=self.device_map,
token=token,
)
self._tokenizer = AutoTokenizer.from_pretrained(
self.model,
revision=self.revision,
torch_dtype=self.torch_dtype,
trust_remote_code=self.trust_remote_code,
token=token,
)
@property
def inputs(self) -> "StepColumns":
"""Either `response` and `instruction`, or a `conversation` columns."""
return {
"response": False,
"instruction": False,
"conversation": False,
}
@property
def outputs(self) -> "StepColumns":
"""The `score` given by the reward model."""
return ["score"]
def _prepare_conversation(self, input: Dict[str, Any]) -> "ChatType":
if "instruction" in input and "response" in input:
return [
{"role": "user", "content": input["instruction"]},
{"role": "assistant", "content": input["response"]},
]
return input["conversation"]
def _prepare_inputs(self, inputs: List[Dict[str, Any]]) -> "torch.Tensor":
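# Apply the tokenizer's chat template to every conversation and move the resulting
# batch of token ids to the device the model lives on.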
return self._tokenizer.apply_chat_template( # type: ignore
[self._prepare_conversation(input) for input in inputs], # type: ignore
return_tensors="pt",
padding=True,
truncation=self.truncation,
max_length=self.max_length,
).to(self._model.device) # type: ignore
def _inference(self, inputs: List[Dict[str, Any]]) -> List[float]:
import torch
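# Tokenize the conversations and run the reward model without gradient tracking,
# returning one score per input.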
input_ids = self._prepare_inputs(inputs)
with torch.no_grad():
output = self._model(input_ids) # type: ignore
logits = output.logits
if logits.shape == (2, 1):
logits = logits.squeeze(-1)
return logits.tolist()
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
scores = self._inference(inputs)
for input, score in zip(inputs, scores):
input["score"] = score
yield inputs
def unload(self) -> None:
if self.device_map in ["cuda", "auto"]:
CudaDevicePlacementMixin.unload(self)
super().unload()
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from distilabel.steps.tasks.apigen.execution_checker import APIGenExecutionChecker
from distilabel.steps.tasks.apigen.generator import APIGenGenerator
from distilabel.steps.tasks.apigen.semantic_checker import APIGenSemanticChecker
from distilabel.steps.tasks.argilla_labeller import ArgillaLabeller
from distilabel.steps.tasks.base import GeneratorTask, ImageTask, Task
from distilabel.steps.tasks.clair import CLAIR
from distilabel.steps.tasks.complexity_scorer import ComplexityScorer
from distilabel.steps.tasks.decorator import task
from distilabel.steps.tasks.evol_instruct.base import EvolInstruct
from distilabel.steps.tasks.evol_instruct.evol_complexity.base import EvolComplexity
from distilabel.steps.tasks.evol_instruct.evol_complexity.generator import (
EvolComplexityGenerator,
)
from distilabel.steps.tasks.evol_instruct.generator import EvolInstructGenerator
from distilabel.steps.tasks.evol_quality.base import EvolQuality
from distilabel.steps.tasks.generate_embeddings import GenerateEmbeddings
from distilabel.steps.tasks.genstruct import Genstruct
from distilabel.steps.tasks.image_generation import ImageGeneration
from distilabel.steps.tasks.improving_text_embeddings import (
BitextRetrievalGenerator,
EmbeddingTaskGenerator,
GenerateLongTextMatchingData,
GenerateShortTextMatchingData,
GenerateTextClassificationData,
GenerateTextRetrievalData,
MonolingualTripletGenerator,
)
from distilabel.steps.tasks.instruction_backtranslation import (
InstructionBacktranslation,
)
from distilabel.steps.tasks.magpie.base import Magpie
from distilabel.steps.tasks.magpie.generator import MagpieGenerator
from distilabel.steps.tasks.math_shepherd.completer import MathShepherdCompleter
from distilabel.steps.tasks.math_shepherd.generator import MathShepherdGenerator
from distilabel.steps.tasks.math_shepherd.utils import FormatPRM
from distilabel.steps.tasks.pair_rm import PairRM
from distilabel.steps.tasks.prometheus_eval import PrometheusEval
from distilabel.steps.tasks.quality_scorer import QualityScorer
from distilabel.steps.tasks.self_instruct import SelfInstruct
from distilabel.steps.tasks.sentence_transformers import GenerateSentencePair
from distilabel.steps.tasks.structured_generation import StructuredGeneration
from distilabel.steps.tasks.text_classification import TextClassification
from distilabel.steps.tasks.text_generation import ChatGeneration, TextGeneration
from distilabel.steps.tasks.text_generation_with_image import TextGenerationWithImage
from distilabel.steps.tasks.ultrafeedback import UltraFeedback
from distilabel.steps.tasks.urial import URIAL
from distilabel.typing import ChatItem, ChatType
__all__ = [
"CLAIR",
"URIAL",
"APIGenExecutionChecker",
"APIGenGenerator",
"APIGenSemanticChecker",
"ArgillaLabeller",
"ArgillaLabeller",
"BitextRetrievalGenerator",
"ChatGeneration",
"ChatItem",
"ChatType",
"ComplexityScorer",
"EmbeddingTaskGenerator",
"EvolComplexity",
"EvolComplexityGenerator",
"EvolInstruct",
"EvolInstructGenerator",
"EvolQuality",
"FormatPRM",
"GenerateEmbeddings",
"GenerateLongTextMatchingData",
"GenerateSentencePair",
"GenerateShortTextMatchingData",
"GenerateTextClassificationData",
"GenerateTextRetrievalData",
"GeneratorTask",
"Genstruct",
"ImageGeneration",
"ImageTask",
"InstructionBacktranslation",
"Magpie",
"MagpieGenerator",
"MathShepherdCompleter",
"MathShepherdGenerator",
"MonolingualTripletGenerator",
"MonolingualTripletGenerator",
"PairRM",
"PrometheusEval",
"QualityScorer",
"SelfInstruct",
"StructuredGeneration",
"Task",
"Task",
"TextClassification",
"TextGeneration",
"TextGenerationWithImage",
"UltraFeedback",
"task",
]