Commit 4d4d8f59 authored by chenzk (v1.0). Pipeline #2741 canceled with stages.
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from distilabel.constants import DISTILABEL_METADATA_KEY
from distilabel.steps.base import Step, StepInput
from distilabel.steps.columns.utils import merge_distilabel_metadata
if TYPE_CHECKING:
from distilabel.typing import StepOutput
class CombineOutputs(Step):
"""Combine the outputs of several upstream steps.
`CombineOutputs` is a `Step` that takes the outputs of several upstream steps and combines
them to generate a new dictionary with all keys/columns of the upstream steps' outputs.
Input columns:
- dynamic (based on the upstream `Step`s): All the columns of the upstream steps' outputs.
Output columns:
- dynamic (based on the upstream `Step`s): All the columns of the upstream steps' outputs.
Categories:
- columns
Examples:
Combine dictionaries of a dataset:
```python
from distilabel.steps import CombineOutputs
combine_outputs = CombineOutputs()
combine_outputs.load()
result = next(
combine_outputs.process(
[{"a": 1, "b": 2}, {"a": 3, "b": 4}],
[{"c": 5, "d": 6}, {"c": 7, "d": 8}],
)
)
# [
# {"a": 1, "b": 2, "c": 5, "d": 6},
# {"a": 3, "b": 4, "c": 7, "d": 8},
# ]
```
Combine upstream steps outputs in a pipeline:
```python
from distilabel.pipeline import Pipeline
from distilabel.steps import CombineOutputs
with Pipeline() as pipeline:
step_1 = ...
step_2 = ...
step_3 = ...
combine = CombineOutputs()
[step_1, step_2, step_3] >> combine
```
"""
def process(self, *inputs: StepInput) -> "StepOutput":
combined_outputs = []
for output_dicts in zip(*inputs):
combined_dict = {}
for output_dict in output_dicts:
combined_dict.update(
{
k: v
for k, v in output_dict.items()
if k != DISTILABEL_METADATA_KEY
}
)
if any(
DISTILABEL_METADATA_KEY in output_dict for output_dict in output_dicts
):
combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(
*output_dicts
)
combined_outputs.append(combined_dict)
yield combined_outputs
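To make the metadata handling in `process` concrete, here is a minimal sketch (the step names `statistics_step_a` and `statistics_step_b` are hypothetical) showing that non-metadata keys are merged directly while `distilabel_metadata` goes through `merge_distilabel_metadata`:
```python
from distilabel.steps import CombineOutputs

combine_outputs = CombineOutputs()
combine_outputs.load()
result = next(
    combine_outputs.process(
        [{"a": 1, "distilabel_metadata": {"statistics_step_a": {"input_tokens": [10]}}}],
        [{"b": 2, "distilabel_metadata": {"statistics_step_b": {"input_tokens": [5]}}}],
    )
)
# [
#     {
#         "a": 1,
#         "b": 2,
#         "distilabel_metadata": {
#             "statistics_step_a": {"input_tokens": [10]},
#             "statistics_step_b": {"input_tokens": [5]},
#         },
#     }
# ]
```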
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from itertools import zip_longest
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from pydantic import field_validator, model_validator
from typing_extensions import Self
from distilabel.steps.base import Step, StepInput
if TYPE_CHECKING:
from distilabel.typing import StepColumns, StepOutput
class ExpandColumns(Step):
"""Expand columns that contain lists into multiple rows.
`ExpandColumns` is a `Step` that takes a list of columns and expands them into multiple
rows. The new rows will have the same data as the original row, except for the expanded
column, which will contain a single item from the original list.
Attributes:
columns: A dictionary that maps the column to be expanded to the new column name
or a list of columns to be expanded. If a list is provided, the new column name
will be the same as the column name.
encoded: A bool informing whether the columns are JSON-encoded lists. If this value is
set to True, the columns will be decoded before expanding. Alternatively, a list of
column names can be provided to specify which columns are encoded; in that case, the
listed columns must be a subset of the columns selected for expansion.
split_statistics: A bool informing whether the statistics in the `distilabel_metadata`
column should be split into multiple rows.
If we want to expand some columns containing a list of strings that come from
having parsed the output of an LLM, the tokens in the `statistics_{step_name}`
entries of the `distilabel_metadata` column should be split to avoid multiplying
them if we aggregate the data afterwards. For example, for a task that is supposed
to generate a list of N instructions, where we want each of those N instructions on
a different row, we should split the statistics by N.
In such a case, set this value to True.
Input columns:
- dynamic (determined by `columns` attribute): The columns to be expanded into
multiple rows.
Output columns:
- dynamic (determined by `columns` attribute): The expanded columns.
Categories:
- columns
Examples:
Expand the selected columns into multiple rows:
```python
from distilabel.steps import ExpandColumns
expand_columns = ExpandColumns(
columns=["generation"],
)
expand_columns.load()
result = next(
expand_columns.process(
[
{
"instruction": "instruction 1",
"generation": ["generation 1", "generation 2"]}
],
)
)
# >>> result
# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]
```
Expand the selected columns which are JSON encoded into multiple rows:
```python
from distilabel.steps import ExpandColumns
expand_columns = ExpandColumns(
columns=["generation"],
encoded=True, # It can also be a list of columns that are encoded, i.e. ["generation"]
)
expand_columns.load()
result = next(
expand_columns.process(
[
{
"instruction": "instruction 1",
"generation": '["generation 1", "generation 2"]'}
],
)
)
# >>> result
# [{'instruction': 'instruction 1', 'generation': 'generation 1'}, {'instruction': 'instruction 1', 'generation': 'generation 2'}]
```
Expand the selected columns and split the statistics in the `distilabel_metadata` column:
```python
from distilabel.steps import ExpandColumns
expand_columns = ExpandColumns(
columns=["generation"],
split_statistics=True,
)
expand_columns.load()
result = next(
expand_columns.process(
[
{
"instruction": "instruction 1",
"generation": ["generation 1", "generation 2"],
"distilabel_metadata": {
"statistics_generation": {
"input_tokens": [12],
"output_tokens": [12],
},
},
}
],
)
)
# >>> result
# [{'instruction': 'instruction 1', 'generation': 'generation 1', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}, {'instruction': 'instruction 1', 'generation': 'generation 2', 'distilabel_metadata': {'statistics_generation': {'input_tokens': [6], 'output_tokens': [6]}}}]
```
"""
columns: Union[Dict[str, str], List[str]]
encoded: Union[bool, List[str]] = False
split_statistics: bool = False
@field_validator("columns")
@classmethod
def always_dict(cls, value: Union[Dict[str, str], List[str]]) -> Dict[str, str]:
"""Ensure that the columns are always a dictionary.
Args:
value: The columns to be expanded.
Returns:
The columns to be expanded as a dictionary.
"""
if isinstance(value, list):
return {col: col for col in value}
return value
@model_validator(mode="after")
def is_subset(self) -> Self:
"""Ensure the "encoded" column names are a subset of the "columns" selected.
Returns:
The "encoded" attribute updated to work internally.
"""
if isinstance(self.encoded, list):
if not set(self.encoded).issubset(set(self.columns.keys())):
raise ValueError(
"The 'encoded' columns must be a subset of the 'columns' selected for expansion."
)
if isinstance(self.encoded, bool):
self.encoded = list(self.columns.keys()) if self.encoded else []
return self
@property
def inputs(self) -> "StepColumns":
"""The columns to be expanded."""
return list(self.columns.keys())
@property
def outputs(self) -> "StepColumns":
"""The expanded columns."""
return [
new_column if new_column else expand_column
for expand_column, new_column in self.columns.items()
]
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
"""Expand the columns in the input data.
Args:
inputs: The input data.
Yields:
The expanded rows.
"""
if self.encoded:
for input in inputs:
for column in self.encoded:
input[column] = json.loads(input[column])
yield [row for input in inputs for row in self._expand_columns(input)]
def _expand_columns(self, input: Dict[str, Any]) -> List[Dict[str, Any]]:
"""Expand the columns in the input data.
Args:
input: The input data.
Returns:
The expanded rows.
"""
metadata_visited = False
expanded_rows = []
# Update the columns here to avoid doing the validation on the `inputs`, as
# `distilabel_metadata` is not defined in the DAG at `Pipeline` creation time.
columns = self.columns
if self.split_statistics:
columns["distilabel_metadata"] = "distilabel_metadata"
for expand_column, new_column in columns.items(): # type: ignore
data = input.get(expand_column)
input, metadata_visited = self._split_metadata(
input, len(data), metadata_visited
)
rows = []
for item, expanded in zip_longest(*[data, expanded_rows], fillvalue=input):
rows.append({**expanded, new_column: item})
expanded_rows = rows
return expanded_rows
def _split_metadata(
self, input: Dict[str, Any], n: int, metadata_visited: bool = False
) -> Tuple[Dict[str, Any], bool]:
"""Helper method to split the statistics in the `distilabel_metadata` column.
Args:
input: The input data.
n: Number of splits to apply to the tokens (if we have 12 tokens and want to split
them 3 times, n==3).
metadata_visited: Bool to prevent updating the data more than once.
Returns:
Updated input with the `distilabel_metadata` updated.
"""
# - If we want to split the statistics, we need to ensure that the metadata is present.
# - Metadata can only be visited once per row to avoid successive splitting.
# TODO: For an odd number of tokens, this will miss 1, we have to fix it.
if (
self.split_statistics
and (metadata := input.get("distilabel_metadata", {}))
and not metadata_visited
):
for k, v in metadata.items():
# Skip empty values (e.g. the key was missing due to an upstream error)
if not v:
continue
if k.startswith("statistics_") and (
"input_tokens" in v and "output_tokens" in v
):
# For num_generations>1 we assume all the tokens should be divided by n
# TODO: The tokens should always come as a list, but there can
# be differences
if isinstance(v["input_tokens"], list):
input_tokens = [value // n for value in v["input_tokens"]]
else:
input_tokens = [v["input_tokens"] // n]
if isinstance(v["input_tokens"], list):
output_tokens = [value // n for value in v["output_tokens"]]
else:
output_tokens = [v["output_tokens"] // n]
input["distilabel_metadata"][k] = {
"input_tokens": input_tokens,
"output_tokens": output_tokens,
}
metadata_visited = True
# Once the metadata has been updated, create a list out of it so that the
# following section can expand it like any other column.
if isinstance(input["distilabel_metadata"], dict):
input["distilabel_metadata"] = [input["distilabel_metadata"]] * n
return input, metadata_visited
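As a complement to the examples above, a minimal sketch of the dictionary form of `columns`, which renames the expanded column; note that, per `_expand_columns`, the original list column is kept in each row when a new column name is provided:
```python
from distilabel.steps import ExpandColumns

expand_columns = ExpandColumns(columns={"generation": "single_generation"})
expand_columns.load()
result = next(
    expand_columns.process(
        [{"instruction": "instruction 1", "generation": ["generation 1", "generation 2"]}]
    )
)
# [
#     {'instruction': 'instruction 1', 'generation': ['generation 1', 'generation 2'], 'single_generation': 'generation 1'},
#     {'instruction': 'instruction 1', 'generation': ['generation 1', 'generation 2'], 'single_generation': 'generation 2'},
# ]
```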
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
from typing_extensions import override
from distilabel.steps.base import Step, StepInput
from distilabel.steps.columns.utils import group_columns
if TYPE_CHECKING:
from distilabel.typing import StepOutput
class GroupColumns(Step):
"""Combines columns from a list of `StepInput`.
`GroupColumns` is a `Step` that implements the `process` method that calls the `group_columns`
function to handle and combine a list of `StepInput`. Also `GroupColumns` provides two attributes
`columns` and `output_columns` to specify the columns to group and the output columns
which will override the default value for the properties `inputs` and `outputs`, respectively.
Attributes:
columns: List of strings with the names of the columns to group.
output_columns: Optional list of strings with the names of the output columns.
Input columns:
- dynamic (determined by `columns` attribute): The columns to group.
Output columns:
- dynamic (determined by `columns` and `output_columns` attributes): The columns
that were grouped.
Categories:
- columns
Examples:
Group columns of a dataset:
```python
from distilabel.steps import GroupColumns
group_columns = GroupColumns(
name="group_columns",
columns=["generation", "model_name"],
)
group_columns.load()
result = next(
group_columns.process(
[{"generation": "AI generated text"}, {"model_name": "my_model"}],
[{"generation": "Other generated text", "model_name": "my_model"}]
)
)
# >>> result
# [{'grouped_generation': ['AI generated text', 'Other generated text'], 'grouped_model_name': ['my_model']}]
```
Specify the name of the output columns:
```python
from distilabel.steps import GroupColumns
group_columns = GroupColumns(
name="group_columns",
columns=["generation", "model_name"],
output_columns=["generations", "generation_models"]
)
group_columns.load()
result = next(
group_columns.process(
[{"generation": "AI generated text"}, {"model_name": "my_model"}],
[{"generation": "Other generated text", "model_name": "my_model"}]
)
)
# >>> result
# [{'generations': ['AI generated text', 'Other generated text'], 'generation_models': ['my_model']}]
```
"""
columns: List[str]
output_columns: Optional[List[str]] = None
@property
def inputs(self) -> List[str]:
"""The inputs for the task are the column names in `columns`."""
return self.columns
@property
def outputs(self) -> List[str]:
"""The outputs for the task are the column names in `output_columns` or
`grouped_{column}` for each column in `columns`."""
return (
self.output_columns
if self.output_columns is not None
else [f"grouped_{column}" for column in self.columns]
)
@override
def process(self, *inputs: StepInput) -> "StepOutput":
"""The `process` method calls the `group_dicts` function to handle and combine a list of `StepInput`.
Args:
*inputs: A list of `StepInput` to be combined.
Yields:
A `StepOutput` with the combined `StepInput` using the `group_columns` function.
"""
yield group_columns(
*inputs,
group_columns=self.inputs,
output_group_columns=self.outputs,
)
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List
from typing_extensions import override
from distilabel.steps.base import Step, StepInput
if TYPE_CHECKING:
from distilabel.typing import StepColumns, StepOutput
class KeepColumns(Step):
"""Keeps selected columns in the dataset.
`KeepColumns` is a `Step` that implements the `process` method that keeps only the columns
specified in the `columns` attribute. Also `KeepColumns` provides an attribute `columns` to
specify the columns to keep which will override the default value for the properties `inputs`
and `outputs`.
Note:
The order in which the columns are provided is important, as the output will be sorted
using the provided order, which is useful before pushing either a `datasets.Dataset` via
the `PushToHub` step or a `distilabel.Distiset` via the `Pipeline.run` output variable.
Attributes:
columns: List of strings with the names of the columns to keep.
Input columns:
- dynamic (determined by `columns` attribute): The columns to keep.
Output columns:
- dynamic (determined by `columns` attribute): The columns that were kept.
Categories:
- columns
Examples:
Select the columns to keep:
```python
from distilabel.steps import KeepColumns
keep_columns = KeepColumns(
columns=["instruction", "generation"],
)
keep_columns.load()
result = next(
keep_columns.process(
[{"instruction": "What's the brightest color?", "generation": "white", "model_name": "my_model"}],
)
)
# >>> result
# [{'instruction': "What's the brightest color?", 'generation': 'white'}]
```
"""
columns: List[str]
@property
def inputs(self) -> "StepColumns":
"""The inputs for the task are the column names in `columns`."""
return self.columns
@property
def outputs(self) -> "StepColumns":
"""The outputs for the task are the column names in `columns`."""
return self.columns
@override
def process(self, *inputs: StepInput) -> "StepOutput":
"""The `process` method keeps only the columns specified in the `columns` attribute.
Args:
*inputs: A list of dictionaries with the input data.
Yields:
A list of dictionaries with the output data.
"""
for input in inputs:
outputs = []
for item in input:
outputs.append({col: item[col] for col in self.columns})
yield outputs
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Optional
from typing_extensions import override
from distilabel.steps.base import Step, StepInput
from distilabel.steps.columns.utils import merge_columns
if TYPE_CHECKING:
from distilabel.typing import StepColumns, StepOutput
class MergeColumns(Step):
"""Merge columns from a row.
`MergeColumns` is a `Step` that implements the `process` method that calls the `merge_columns`
function to handle and combine columns in a `StepInput`. `MergeColumns` provides two attributes
`columns` and `output_column` to specify the columns to merge and the resulting output column.
This step can be useful if you have a `Task` that generates instructions, for example, and
you want more examples of those. In such a case, you could use another `Task` to multiply
your instructions synthetically, which would yield two separate columns. Using `MergeColumns`
you can merge them and use them as a single column in your dataset for further processing.
Attributes:
columns: List of strings with the names of the columns to merge.
output_column: Optional name of the output column. Defaults to "merged_column" when not provided.
Input columns:
- dynamic (determined by `columns` attribute): The columns to merge.
Output columns:
- dynamic (determined by `columns` and `output_column` attributes): The columns
that were merged.
Categories:
- columns
Examples:
Combine columns in rows of a dataset:
```python
from distilabel.steps import MergeColumns
combiner = MergeColumns(
columns=["queries", "multiple_queries"],
output_column="queries",
)
combiner.load()
result = next(
combiner.process(
[
{
"queries": "How are you?",
"multiple_queries": ["What's up?", "Everything ok?"]
}
],
)
)
# >>> result
# [{'queries': ['How are you?', "What's up?", 'Everything ok?']}]
```
"""
columns: List[str]
output_column: Optional[str] = None
@property
def inputs(self) -> "StepColumns":
return self.columns
@property
def outputs(self) -> "StepColumns":
return [self.output_column] if self.output_column else ["merged_column"]
@override
def process(self, inputs: StepInput) -> "StepOutput":
combined = []
for input in inputs:
combined.append(
merge_columns(
input,
columns=self.columns,
new_column=self.outputs[0],
)
)
yield combined
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
from distilabel.constants import DISTILABEL_METADATA_KEY
if TYPE_CHECKING:
from distilabel.steps.base import StepInput
def merge_distilabel_metadata(
*output_dicts: Dict[str, Any],
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
"""
Merge the `DISTILABEL_METADATA_KEY` from multiple output dictionaries. `DISTILABEL_METADATA_KEY`
can be either a dictionary containing metadata keys or a list containing dictionaries
of metadata keys.
Args:
*output_dicts: Variable number of dictionaries or lists containing distilabel metadata.
Returns:
A merged dictionary or list containing all the distilabel metadata.
"""
merged_metadata = defaultdict(list)
for output_dict in output_dicts:
metadata = output_dict.get(DISTILABEL_METADATA_KEY, {})
# If the value under `DISTILABEL_METADATA_KEY` is a `list` then it contains dictionaries
# with the metadata per `num_generations` created when `group_generations==True`
if isinstance(metadata, list):
if not isinstance(merged_metadata, list):
merged_metadata = []
merged_metadata.extend(metadata)
else:
for key, value in metadata.items():
merged_metadata[key].append(value)
if isinstance(merged_metadata, list):
return merged_metadata
final_metadata = {}
for key, value_list in merged_metadata.items():
if len(value_list) == 1:
final_metadata[key] = value_list[0]
else:
final_metadata[key] = value_list
return final_metadata
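A minimal sketch of the merge behaviour for dictionary metadata, assuming a hypothetical `statistics_step` key: values appearing under the same key in several dictionaries are collected into a list, while keys appearing only once keep their single value:
```python
from distilabel.steps.columns.utils import merge_distilabel_metadata

merged = merge_distilabel_metadata(
    {"distilabel_metadata": {"statistics_step": {"input_tokens": [10]}}},
    {"distilabel_metadata": {"statistics_step": {"input_tokens": [5]}}},
)
# {'statistics_step': [{'input_tokens': [10]}, {'input_tokens': [5]}]}
```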
def group_columns(
*inputs: "StepInput",
group_columns: List[str],
output_group_columns: Optional[List[str]] = None,
) -> "StepInput":
"""Groups multiple list of dictionaries into a single list of dictionaries on the
specified `group_columns`. If `group_columns` are provided, then it will also rename
`group_columns`.
Args:
inputs: list of dictionaries to combine.
group_columns: list of keys to merge on.
output_group_columns: list of keys to rename the merge keys to. Defaults to `None`.
Returns:
A list of dictionaries where the values of the `group_columns` are combined into a
list and renamed to `output_group_columns`.
"""
if output_group_columns is not None and len(output_group_columns) != len(
group_columns
):
raise ValueError(
"The length of `output_group_columns` must be the same as the length of `group_columns`."
)
if output_group_columns is None:
output_group_columns = [f"grouped_{key}" for key in group_columns]
group_columns_dict = dict(zip(group_columns, output_group_columns))
result = []
# Use zip to iterate over lists based on their index
for dicts_at_index in zip(*inputs):
combined_dict = {}
metadata_dicts = []
# Iterate over dicts at the same index
for d in dicts_at_index:
# Extract metadata for merging
if DISTILABEL_METADATA_KEY in d:
metadata_dicts.append(
{DISTILABEL_METADATA_KEY: d[DISTILABEL_METADATA_KEY]}
)
# Iterate over key-value pairs in each dict
for key, value in d.items():
if key == DISTILABEL_METADATA_KEY:
continue
# If the key is one of the columns to group, append the value to its list
if key in group_columns_dict.keys():
combined_dict.setdefault(group_columns_dict[key], []).append(value)
# Otherwise, keep it as a regular key-value pair (the last value wins)
else:
combined_dict[key] = value
if metadata_dicts:
combined_dict[DISTILABEL_METADATA_KEY] = merge_distilabel_metadata(
*metadata_dicts
)
result.append(combined_dict)
return result
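A standalone sketch of `group_columns` with hypothetical rows: grouped keys are collected into lists under their `grouped_*` names, while columns not listed in `group_columns` keep the value of the last input that defines them:
```python
from distilabel.steps.columns.utils import group_columns

result = group_columns(
    [{"generation": "text A", "model_name": "model-1"}],
    [{"generation": "text B", "model_name": "model-2"}],
    group_columns=["generation"],
)
# [{'grouped_generation': ['text A', 'text B'], 'model_name': 'model-2'}]
```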
def merge_columns(
row: Dict[str, Any], columns: List[str], new_column: str = "combined_key"
) -> Dict[str, Any]:
"""Merge columns in a dictionary into a single column on the specified `new_column`.
Args:
row: Dictionary corresponding to a row in a dataset.
columns: List of keys to merge.
new_column: Name of the new key created.
Returns:
Dictionary with the new merged key.
"""
result = row.copy() # preserve the original dictionary
combined = []
for key in columns:
to_combine = result.pop(key)
if not isinstance(to_combine, list):
to_combine = [to_combine]
combined += to_combine
result[new_column] = combined
return result
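A standalone sketch of `merge_columns`, mirroring the `MergeColumns` example above: non-list values are wrapped in a list before concatenation, and the merged keys are removed from the row:
```python
from distilabel.steps.columns.utils import merge_columns

row = {"queries": "How are you?", "multiple_queries": ["What's up?", "Everything ok?"]}
merged = merge_columns(row, columns=["queries", "multiple_queries"], new_column="queries")
# {'queries': ['How are you?', "What's up?", 'Everything ok?']}
```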
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
Type,
Union,
overload,
)
from pydantic import create_model
from distilabel.mixins.runtime_parameters import _RUNTIME_PARAMETER_ANNOTATION
from distilabel.steps.base import (
_STEP_INPUT_ANNOTATION,
GeneratorStep,
GlobalStep,
Step,
)
from distilabel.utils.typing_ import is_parameter_annotated_with
if TYPE_CHECKING:
from distilabel.steps.base import _Step
from distilabel.typing import GeneratorStepOutput, StepColumns, StepOutput
_STEP_MAPPING = {
"normal": Step,
"global": GlobalStep,
"generator": GeneratorStep,
}
ProcessingFunc = Callable[..., Union["StepOutput", "GeneratorStepOutput"]]
@overload
def step(
inputs: Union["StepColumns", None] = None,
outputs: Union["StepColumns", None] = None,
step_type: Literal["normal"] = "normal",
) -> Callable[..., Type["Step"]]: ...
@overload
def step(
inputs: Union["StepColumns", None] = None,
outputs: Union["StepColumns", None] = None,
step_type: Literal["global"] = "global",
) -> Callable[..., Type["GlobalStep"]]: ...
@overload
def step(
inputs: None = None,
outputs: Union["StepColumns", None] = None,
step_type: Literal["generator"] = "generator",
) -> Callable[..., Type["GeneratorStep"]]: ...
def step(
inputs: Union["StepColumns", None] = None,
outputs: Union["StepColumns", None] = None,
step_type: Literal["normal", "global", "generator"] = "normal",
) -> Callable[..., Type["_Step"]]:
"""Creates an `Step` from a processing function.
Args:
inputs: a list containing the name of the input columns/keys required by the step,
or a dictionary where the keys are the columns and the values are booleans
indicating whether the column is required or not. If not provided the default
will be an empty list `[]` and it will be assumed that the step doesn't need
any specific columns. Defaults to `None`.
outputs: a list containing the name of the output columns/keys or a dictionary
where the keys are the columns and the values are booleans indicating whether
the column will be generated or not. If not provided the default will be an
empty list `[]` and it will be assumed that the step doesn't generate any
specific columns. Defaults to `None`.
step_type: the kind of step to create. Valid choices are: "normal" (`Step`),
"global" (`GlobalStep`) or "generator" (`GeneratorStep`). Defaults to
`"normal"`.
Returns:
A callable that will generate the type given the processing function.
Example:
```python
# Normal step
@step(inputs=["instruction"], outputs=["generation"])
def GenerationStep(inputs: StepInput, dummy_generation: RuntimeParameter[str]) -> StepOutput:
for input in inputs:
input["generation"] = dummy_generation
yield inputs
# Global step
@step(inputs=["instruction"], step_type="global")
def FilteringStep(inputs: StepInput, max_length: RuntimeParameter[int] = 256) -> StepOutput:
yield [
input
for input in inputs
if len(input["instruction"]) <= max_length
]
# Generator step
@step(outputs=["num"], step_type="generator")
def RowGenerator(num_rows: RuntimeParameter[int] = 500) -> GeneratorStepOutput:
data = list(range(num_rows))
for i in range(0, len(data), 100):
last_batch = i + 100 >= len(data)
yield [{"num": num} for num in data[i : i + 100]], last_batch
```
"""
inputs = inputs or []
outputs = outputs or []
def decorator(func: ProcessingFunc) -> Type["_Step"]:
if step_type not in _STEP_MAPPING:
raise ValueError(
f"Invalid step type '{step_type}'. Please, review the '{func.__name__}'"
" function decorated with the `@step` decorator and provide a valid"
" `step_type`. Valid choices are: 'normal', 'global' or 'generator'."
)
BaseClass = _STEP_MAPPING[step_type]
signature = inspect.signature(func)
runtime_parameters = {}
step_input_parameter = None
for name, param in signature.parameters.items():
if is_parameter_annotated_with(param, _RUNTIME_PARAMETER_ANNOTATION):
runtime_parameters[name] = (
param.annotation,
param.default if param.default != param.empty else None,
)
if not step_type == "generator" and is_parameter_annotated_with(
param, _STEP_INPUT_ANNOTATION
):
if step_input_parameter is not None:
raise ValueError(
f"Function '{func.__name__}' has more than one parameter annotated"
f" with `StepInput`. Please, review the '{func.__name__}' function"
" decorated with the `@step` decorator and provide only one"
" argument annotated with `StepInput`."
)
step_input_parameter = param
RuntimeParametersModel = create_model( # type: ignore
"RuntimeParametersModel",
**runtime_parameters, # type: ignore
)
def inputs_property(self) -> "StepColumns":
return inputs
def outputs_property(self) -> "StepColumns":
return outputs
def process(
self, *args: Any, **kwargs: Any
) -> Union["StepOutput", "GeneratorStepOutput"]:
return func(*args, **kwargs)
return type( # type: ignore
func.__name__,
(
BaseClass,
RuntimeParametersModel,
),
{
"process": process,
"inputs": property(inputs_property),
"outputs": property(outputs_property),
"__module__": func.__module__,
"__doc__": func.__doc__,
"_built_from_decorator": True,
# Override the `get_process_step_input` method to return the parameter
# of the original function annotated with `StepInput`.
"get_process_step_input": lambda self: step_input_parameter,
},
)
return decorator
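A minimal end-to-end sketch of using the decorator, assuming `step` and `StepInput` are exported from `distilabel.steps` and `StepOutput` from `distilabel.typing`, as in the examples above; `UpperCaseStep` is a hypothetical step, and the generated class is instantiated and called like any handwritten `Step`:
```python
from distilabel.steps import StepInput, step
from distilabel.typing import StepOutput


@step(inputs=["instruction"], outputs=["upper_instruction"])
def UpperCaseStep(inputs: StepInput) -> StepOutput:
    # The `StepInput`-annotated parameter receives the batch of rows to process.
    for input in inputs:
        input["upper_instruction"] = input["instruction"].upper()
    yield inputs


upper_case = UpperCaseStep()
upper_case.load()
result = next(upper_case.process([{"instruction": "hello"}]))
# [{'instruction': 'hello', 'upper_instruction': 'HELLO'}]
```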
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Literal
import numpy as np
from pydantic import Field
from distilabel.mixins.runtime_parameters import RuntimeParameter
from distilabel.steps.base import GlobalStep, StepInput
if TYPE_CHECKING:
from distilabel.steps.typing import StepOutput
class DeitaFiltering(GlobalStep):
"""Filter dataset rows using DEITA filtering strategy.
Filter the dataset based on the DEITA score and the cosine distance between the embeddings.
It's an implementation of the filtering step from the paper 'What Makes Good Data
for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning'.
Attributes:
data_budget: The desired size of the dataset after filtering.
diversity_threshold: If a row has a cosine distance with respect to its nearest
neighbor greater than this value, it will be included in the filtered dataset.
Defaults to `0.9`.
normalize_embeddings: Whether to normalize the embeddings before computing the cosine
distance. Defaults to `True`.
distance_metric: The distance metric to use, either `"cosine"` or `"manhattan"`.
Defaults to `"cosine"`.
Runtime parameters:
- `data_budget`: The desired size of the dataset after filtering.
- `diversity_threshold`: If a row has a cosine distance with respect to its nearest
neighbor greater than this value, it will be included in the filtered dataset.
- `normalize_embeddings`: Whether to normalize the embeddings before computing the
cosine distance. Defaults to `True`.
- `distance_metric`: The distance metric to use, either `"cosine"` or `"manhattan"`.
Defaults to `"cosine"`.
Input columns:
- evol_instruction_score (`float`): The score of the instruction generated by
`ComplexityScorer` step.
- evol_response_score (`float`): The score of the response generated by
`QualityScorer` step.
- embedding (`List[float]`): The embedding generated for the conversation of the
instruction-response pair using `GenerateEmbeddings` step.
Output columns:
- deita_score (`float`): The DEITA score for the instruction-response pair.
- deita_score_computed_with (`List[str]`): The scores used to compute the DEITA
score.
- nearest_neighbor_distance (`float`): The cosine distance between the embeddings
of the instruction-response pair.
Categories:
- filtering
References:
- [`What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning`](https://arxiv.org/abs/2312.15685)
Examples:
Filter the dataset based on the DEITA score and the cosine distance between the embeddings:
```python
from distilabel.steps import DeitaFiltering
deita_filtering = DeitaFiltering(data_budget=1)
deita_filtering.load()
result = next(
deita_filtering.process(
[
{
"evol_instruction_score": 0.5,
"evol_response_score": 0.5,
"embedding": [-8.12729941, -5.24642847, -6.34003029],
},
{
"evol_instruction_score": 0.6,
"evol_response_score": 0.6,
"embedding": [2.99329242, 0.7800932, 0.7799726],
},
{
"evol_instruction_score": 0.7,
"evol_response_score": 0.7,
"embedding": [10.29041806, 14.33088073, 13.00557506],
},
],
)
)
# >>> result
# [{'evol_instruction_score': 0.5, 'evol_response_score': 0.5, 'embedding': [-8.12729941, -5.24642847, -6.34003029], 'deita_score': 0.25, 'deita_score_computed_with': ['evol_instruction_score', 'evol_response_score'], 'nearest_neighbor_distance': 1.9042812683723933}]
```
Citations:
```
@misc{liu2024makesgooddataalignment,
title={What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning},
author={Wei Liu and Weihao Zeng and Keqing He and Yong Jiang and Junxian He},
year={2024},
eprint={2312.15685},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2312.15685},
}
```
"""
data_budget: RuntimeParameter[int] = Field(
default=None, description="The desired size of the dataset after filtering."
)
diversity_threshold: RuntimeParameter[float] = Field(
default=0.9,
description="If a row has a cosine distance with respect to it's nearest neighbor"
" greater than this value, it will be included in the filtered dataset.",
)
normalize_embeddings: RuntimeParameter[bool] = Field(
default=True,
description="Whether to normalize the embeddings before computing the cosine distance.",
)
distance_metric: RuntimeParameter[Literal["cosine", "manhattan"]] = Field(
default="cosine",
description="The distance metric to use. Currently only 'cosine' is supported.",
)
@property
def inputs(self) -> List[str]:
return ["evol_instruction_score", "evol_response_score", "embedding"]
@property
def outputs(self) -> List[str]:
return ["deita_score", "nearest_neighbor_distance", "deita_score_computed_with"]
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
"""Filter the dataset based on the DEITA score and the cosine distance between the
embeddings.
Args:
inputs: The input data.
Yields:
The filtered dataset.
"""
inputs = self._compute_deita_score(inputs)
inputs = self._compute_nearest_neighbor(inputs)
inputs.sort(key=lambda x: x["deita_score"], reverse=True)
selected_rows = []
for input in inputs:
if len(selected_rows) >= self.data_budget: # type: ignore
break
if input["nearest_neighbor_distance"] >= self.diversity_threshold:
selected_rows.append(input)
yield selected_rows
def _compute_deita_score(self, inputs: StepInput) -> StepInput:
"""Computes the DEITA score for each instruction-response pair. The DEITA score is
the product of the instruction score and the response score.
Args:
inputs: The input data.
Returns:
The input data with the DEITA score computed.
"""
for input_ in inputs:
evol_instruction_score = input_.get("evol_instruction_score")
evol_response_score = input_.get("evol_response_score")
if evol_instruction_score and evol_response_score:
deita_score = evol_instruction_score * evol_response_score
score_computed_with = ["evol_instruction_score", "evol_response_score"]
elif evol_instruction_score:
self._logger.warning(
"Response score is missing for the instruction-response pair. Using"
" instruction score as DEITA score."
)
deita_score = evol_instruction_score
score_computed_with = ["evol_instruction_score"]
elif evol_response_score:
self._logger.warning(
"Instruction score is missing for the instruction-response pair. Using"
" response score as DEITA score."
)
deita_score = evol_response_score
score_computed_with = ["evol_response_score"]
else:
self._logger.warning(
"Instruction and response scores are missing for the instruction-response"
" pair. Setting DEITA score to 0."
)
deita_score = 0
score_computed_with = []
input_.update(
{
"deita_score": deita_score,
"deita_score_computed_with": score_computed_with,
}
)
return inputs
def _compute_nearest_neighbor(self, inputs: StepInput) -> StepInput:
"""Computes the cosine distance between the embeddings of the instruction-response
pairs and the nearest neighbor.
Args:
inputs: The input data.
Returns:
The input data with the cosine distance computed.
"""
embeddings = np.array([input["embedding"] for input in inputs])
if self.normalize_embeddings:
embeddings = self._normalize_embeddings(embeddings)
self._logger.info("📏 Computing nearest neighbor distance...")
if self.distance_metric == "cosine":
self._logger.info("📏 Using cosine distance.")
distances = self._cosine_distance(embeddings)
else:
self._logger.info("📏 Using manhattan distance.")
distances = self._manhattan_distance(embeddings)
for distance, input in zip(distances, inputs):
input["nearest_neighbor_distance"] = distance
return inputs
def _normalize_embeddings(self, embeddings: np.ndarray) -> np.ndarray:
"""Normalize the embeddings.
Args:
embeddings: The embeddings to normalize.
Returns:
The normalized embeddings.
"""
self._logger.info("⚖️ Normalizing embeddings...")
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
return embeddings / norms
def _cosine_distance(self, embeddings: np.ndarray) -> np.ndarray:
"""Computes the cosine distance between the embeddings.
Args:
embeddings: The embeddings.
Returns:
The cosine distance between the embeddings.
"""
# With normalized embeddings (see `normalize_embeddings`), the dot product is
# the cosine similarity, so `1 - similarity` is the cosine distance.
cosine_similarity = np.dot(embeddings, embeddings.T)
cosine_distance = 1 - cosine_similarity
# Ignore self-distance
np.fill_diagonal(cosine_distance, np.inf)
return np.min(cosine_distance, axis=1)
def _manhattan_distance(self, embeddings: np.ndarray) -> np.ndarray:
"""Computes the manhattan distance between the embeddings.
Args:
embeddings: The embeddings.
Returns:
The manhattan distance between the embeddings.
"""
manhattan_distance = np.abs(embeddings[:, None] - embeddings).sum(-1)
# Ignore self-distance
np.fill_diagonal(manhattan_distance, np.inf)
return np.min(manhattan_distance, axis=1)
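To make the distance computation above concrete, a standalone numpy trace of the `_normalize_embeddings` and `_cosine_distance` logic on toy 2-D embeddings:
```python
import numpy as np

embeddings = np.array([[1.0, 0.0], [0.0, 1.0], [0.8, 0.6]])
# Normalize rows to unit length so the dot product equals cosine similarity.
normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
cosine_distance = 1 - normalized @ normalized.T
np.fill_diagonal(cosine_distance, np.inf)  # ignore self-distance
print(np.min(cosine_distance, axis=1))  # [0.2 0.4 0.2] -> each row's nearest neighbour distance
```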
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from distilabel.models.embeddings.base import Embeddings
from distilabel.steps.base import Step, StepInput
if TYPE_CHECKING:
from distilabel.typing import StepColumns, StepOutput
class EmbeddingGeneration(Step):
"""Generate embeddings using an `Embeddings` model.
`EmbeddingGeneration` is a `Step` that uses an `Embeddings` model to generate sentence
embeddings for the provided input texts.
Attributes:
embeddings: the `Embeddings` model used to generate the sentence embeddings.
Input columns:
- text (`str`): The text for which the sentence embedding has to be generated.
Output columns:
- embedding (`List[Union[float, int]]`): the generated sentence embedding.
- model_name (`str`): the name of the model used to generate the embedding.
Categories:
- embedding
Examples:
Generate sentence embeddings with Sentence Transformers:
```python
from distilabel.models import SentenceTransformerEmbeddings
from distilabel.steps import EmbeddingGeneration
embedding_generation = EmbeddingGeneration(
embeddings=SentenceTransformerEmbeddings(
model="mixedbread-ai/mxbai-embed-large-v1",
)
)
embedding_generation.load()
result = next(embedding_generation.process([{"text": "Hello, how are you?"}]))
# [{'text': 'Hello, how are you?', 'embedding': [0.06209656596183777, -0.015797119587659836, ...]}]
```
"""
embeddings: Embeddings
@property
def inputs(self) -> "StepColumns":
return ["text"]
@property
def outputs(self) -> "StepColumns":
return ["embedding", "model_name"]
def load(self) -> None:
"""Loads the `Embeddings` model."""
super().load()
self.embeddings.load()
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
embeddings = self.embeddings.encode(inputs=[input["text"] for input in inputs])
for input, embedding in zip(inputs, embeddings):
input["embedding"] = embedding
input["model_name"] = self.embeddings.model_name
yield inputs
def unload(self) -> None:
super().unload()
self.embeddings.unload()
# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.util
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
import numpy as np
from datasets import Dataset
from pydantic import Field
from distilabel.mixins.runtime_parameters import RuntimeParameter
from distilabel.steps import GlobalStep, StepInput
if TYPE_CHECKING:
from distilabel.typing import StepOutput
class FaissNearestNeighbour(GlobalStep):
"""Create a `faiss` index to get the nearest neighbours.
`FaissNearestNeighbour` is a `GlobalStep` that creates a `faiss` index using the Hugging
Face `datasets` library integration, and then gets the nearest neighbours and the scores
or distance of the nearest neighbours for each input row.
Attributes:
device: the CUDA device ID or a list of IDs to be used. If negative integer, it
will use all the available GPUs. Defaults to `None`.
string_factory: the name of the factory to be used to build the `faiss` index.
Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.
Defaults to `None`.
metric_type: the metric to be used to measure the distance between the points. It's
an integer and the recommended way to pass it is importing `faiss` and then passing
one of the `faiss.METRIC_x` variables. Defaults to `None`.
k: the number of nearest neighbours to search for each input row. Defaults to `1`.
search_batch_size: the number of rows to include in a search batch. The value can
be adjusted to maximize the resources usage or to avoid OOM issues. Defaults
to `50`.
train_size: If the index needs a training step, specifies how many vectors will be
used to train the index.
Runtime parameters:
- `device`: the CUDA device ID or a list of IDs to be used. If negative integer,
it will use all the available GPUs. Defaults to `None`.
- `string_factory`: the name of the factory to be used to build the `faiss` index.
Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.
Defaults to `None`.
- `metric_type`: the metric to be used to measure the distance between the points.
It's an integer and the recommended way to pass it is importing `faiss` and then
passing one of the `faiss.METRIC_x` variables. Defaults to `None`.
- `k`: the number of nearest neighbours to search for each input row. Defaults to `1`.
- `search_batch_size`: the number of rows to include in a search batch. The value
can be adjusted to maximize the resources usage or to avoid OOM issues. Defaults
to `50`.
- `train_size`: If the index needs a training step, specifies how many vectors will
be used to train the index.
Input columns:
- embedding (`List[Union[float, int]]`): a sentence embedding.
Output columns:
- nn_indices (`List[int]`): a list containing the indices of the `k` nearest neighbours
in the inputs for the row.
- nn_scores (`List[float]`): a list containing the score or distance to each `k`
nearest neighbour in the inputs.
Categories:
- embedding
References:
- [`The Faiss library`](https://arxiv.org/abs/2401.08281)
Examples:
Generating embeddings and getting the nearest neighbours:
```python
from distilabel.models import SentenceTransformerEmbeddings
from distilabel.pipeline import Pipeline
from distilabel.steps import EmbeddingGeneration, FaissNearestNeighbour, LoadDataFromHub
with Pipeline(name="hello") as pipeline:
load_data = LoadDataFromHub(output_mappings={"prompt": "text"})
embeddings = EmbeddingGeneration(
embeddings=SentenceTransformerEmbeddings(
model="mixedbread-ai/mxbai-embed-large-v1"
)
)
nearest_neighbours = FaissNearestNeighbour()
load_data >> embeddings >> nearest_neighbours
if __name__ == "__main__":
distiset = pipeline.run(
parameters={
load_data.name: {
"repo_id": "distilabel-internal-testing/instruction-dataset-mini",
"split": "test",
},
},
use_cache=False,
)
```
Citations:
```
@misc{douze2024faisslibrary,
title={The Faiss library},
author={Matthijs Douze and Alexandr Guzhva and Chengqi Deng and Jeff Johnson and Gergely Szilvasy and Pierre-Emmanuel Mazaré and Maria Lomeli and Lucas Hosseini and Hervé Jégou},
year={2024},
eprint={2401.08281},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2401.08281},
}
```
"""
device: Optional[RuntimeParameter[Union[int, List[int]]]] = Field(
default=None,
description="The CUDA device ID or a list of IDs to be used. If negative integer,"
" it will use all the available GPUs.",
)
string_factory: Optional[RuntimeParameter[str]] = Field(
default=None,
description="The name of the factory to be used to build the `faiss` index."
"Available string factories can be checked here: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes.",
)
metric_type: Optional[RuntimeParameter[int]] = Field(
default=None,
description="The metric to be used to measure the distance between the points. It's"
" an integer and the recommend way to pass it is importing `faiss` and thenpassing"
" one of `faiss.METRIC_x` variables.",
)
k: Optional[RuntimeParameter[int]] = Field(
default=1,
description="The number of nearest neighbours to search for each input row.",
)
search_batch_size: Optional[RuntimeParameter[int]] = Field(
default=50,
description="The number of rows to include in a search batch. The value can be adjusted"
" to maximize the resources usage or to avoid OOM issues.",
)
train_size: Optional[RuntimeParameter[int]] = Field(
default=None,
description="If the index needs a training step, specifies how many vectors will be used to train the index.",
)
def load(self) -> None:
super().load()
if importlib.util.find_spec("faiss") is None:
raise ImportError(
"`faiss` package is not installed. Please install it using `pip install"
" 'distilabel[faiss-cpu]' or 'distilabel[faiss-gpu]'`."
)
@property
def inputs(self) -> List[str]:
return ["embedding"]
@property
def outputs(self) -> List[str]:
return ["nn_indices", "nn_scores"]
def _build_index(self, inputs: List[Dict[str, Any]]) -> Dataset:
"""Builds a `faiss` index using `datasets` integration.
Args:
inputs: a list of dictionaries.
Returns:
The built `datasets.Dataset` with its `faiss` index.
"""
dataset = Dataset.from_list(inputs)
if self.train_size is not None and self.string_factory:
self._logger.info("🏋️‍♀️ Starting Faiss index training...")
dataset.add_faiss_index(
column="embedding",
device=self.device, # type: ignore
string_factory=self.string_factory,
metric_type=self.metric_type,
train_size=self.train_size,
)
return dataset
def _save_index(self, dataset: Dataset) -> None:
"""Save the generated Faiss index as an artifact of the step.
Args:
dataset: the dataset with the `faiss` index built.
"""
self.save_artifact(
name="faiss_index",
write_function=lambda path: dataset.save_faiss_index(
index_name="embedding", file=path / "index.faiss"
),
metadata={
"num_rows": len(dataset),
"embedding_dim": len(dataset[0]["embedding"]),
},
)
def _search(self, dataset: Dataset) -> Dataset:
"""Search the top `k` nearest neighbours for each row in the dataset.
Args:
dataset: the dataset with the `faiss` index built.
Returns:
The updated dataset containing the top `k` nearest neighbours for each row,
as well as the score or distance.
"""
def add_search_results(examples: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
queries = np.array(examples["embedding"])
results = dataset.search_batch(
index_name="embedding",
queries=queries,
k=self.k + 1, # type: ignore
)
examples["nn_indices"] = [indices[1:] for indices in results.total_indices]
examples["nn_scores"] = [scores[1:] for scores in results.total_scores]
return examples
return dataset.map(
add_search_results, batched=True, batch_size=self.search_batch_size
)
def process(self, inputs: StepInput) -> "StepOutput": # type: ignore
dataset = self._build_index(inputs)
dataset_with_search_results = self._search(dataset)
self._save_index(dataset)
yield dataset_with_search_results.to_list()