pipeline_apigen.py

# Copyright 2023-present, Argilla, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from datasets import load_dataset

from distilabel.models import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import CombineOutputs, DataSampler, LoadDataFromDicts
from distilabel.steps.tasks import (
    APIGenExecutionChecker,
    APIGenGenerator,
    APIGenSemanticChecker,
)
from distilabel.steps.tasks.apigen.utils import PrepareExamples, load_module_from_path

libpath = Path(__file__).parent / "lib_apigen.py"

data = [
    {
        "func_name": "final_velocity",
        "func_desc": "Calculates the final velocity of an object given its initial velocity, acceleration, and time.",
    },
    {
        "func_name": "permutation_count",
        "func_desc": "Calculates the number of permutations of k elements from a set of n elements.",
    },
    {
        "func_name": "getdivision",
        "func_desc": "Divides two numbers by making an API call to a division service.",
    },
    {
        "func_name": "binary_addition",
        "func_desc": "Adds two binary numbers and returns the result as a binary string.",
    },
    {
        "func_name": "swapi_planet_resource",
        "func_desc": "get a specific planets resource",
    },
    {
        "func_name": "disney_character",
        "func_desc": "Find a specific character using this endpoint",
    },
]

libpath_module = load_module_from_path(libpath)
tools = libpath_module.get_tools()  # call get_tools()

# TODO: Add in the tools between 0 and 2 extra tools to make the task more challenging.
for row in data:
    # The tools should have a mix where both the correct and irrelevant tools are present.
    row.update({"tools": [tools[row["func_name"]]]})


ds_og = (
    load_dataset("Salesforce/xlam-function-calling-60k", split="train")
    .shuffle(seed=42)
    .select(range(500))
    .to_list()
)


with Pipeline(name="APIGenPipeline") as pipeline:
    loader_seeds = LoadDataFromDicts(data=data)
    sampler = DataSampler(
        data=ds_og,
        size=2,
        samples=len(data),
        batch_size=8,
    )

    prep_examples = PrepareExamples()

    model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
    llm = InferenceEndpointsLLM(
        model_id=model_id,
        tokenizer_id=model_id,
        generation_kwargs={
            "temperature": 0.7,
            "max_new_tokens": 2048,
        },
    )
    apigen = APIGenGenerator(
        llm=llm,
        use_default_structured_output=True,
    )
    combine_steps = CombineOutputs()

    execution_checker = APIGenExecutionChecker(libpath=str(libpath))
    semantic_checker = APIGenSemanticChecker(llm=llm)

    sampler >> prep_examples
    (
        [loader_seeds, prep_examples]
        >> combine_steps
        >> apigen
        >> execution_checker
        >> semantic_checker
    )


if __name__ == "__main__":
    distiset = pipeline.run()
    print(distiset["default"]["train"][0])