---
distilabel:
  version: 1.0.0.b0
pipeline:
  name: test-pipeline
  description: As the name suggests, a test pipeline.
  steps:
    - step:
        name: load_hub_dataset
        input_mappings: {}
        output_mappings:
          prompt: instruction
        batch_size: 100
        repo_id: null
        split: train
        config: null
        runtime_parameters_info:
          - name: repo_id
            optional: false
            description: The Hugging Face Hub repository ID of the dataset to load.
          - name: split
            optional: true
            description: The split of the dataset to load. Defaults to 'train'.
          - name: config
            optional: true
            description:
              The configuration of the dataset to load. This is optional and
              only needed if the dataset has multiple configurations.
        type_info:
          module: distilabel.steps.generators.huggingface
          name: LoadDataFromHub
      name: load_hub_dataset
    - step:
        name: text_generation_gpt
        input_mappings: {}
        output_mappings:
          model_name: model
        input_batch_size: 10
        llm:
          generation_kwargs: {}
          model: gpt-3.5-turbo
          base_url: https://api.openai.com/v1
          type_info:
            module: distilabel.models.llms.openai
            name: OpenAILLM
        group_generations: false
        num_generations: 3
        runtime_parameters_info:
          - name: llm
            runtime_parameters_info:
              - name: generation_kwargs
                description:
                  The kwargs to be propagated to either `generate` or `agenerate`
                  methods within each `LLM`.
                keys:
                  - name: max_new_tokens
                    optional: true
                    description:
                      the maximum number of new tokens that the model will generate. Defaults
                      to `128`.
                  - name: frequency_penalty
                    optional: true
                  - name: presence_penalty
                    optional: true
                    description:
                      the presence penalty to use for the generation. Defaults
                      to `0.0`.
                  - name: temperature
                    optional: true
                    description: the temperature to use for the generation. Defaults to `0.1`.
                  - name: top_p
                    optional: true
                    description: the top-p value to use for the generation. Defaults to `1.0`.
              - name: base_url
                optional: true
                description: The base URL to use for the OpenAI API requests.
              - name: api_key
                optional: true
                description: The API key to authenticate the requests to the OpenAI API.
          - name: num_generations
            optional: true
            description: The number of generations to be produced per input.
        type_info:
          module: distilabel.steps.tasks.text_generation
          name: TextGeneration
      name: text_generation_gpt
    - step:
        name: text_generation_gpt_2
        input_mappings: {}
        output_mappings:
          model_name: model
        input_batch_size: 10
        llm:
          generation_kwargs: {}
          model: gpt-3.5-turbo
          base_url: https://api.openai.com/v1
          type_info:
            module: distilabel.models.llms.openai
            name: OpenAILLM
        group_generations: true
        num_generations: 3
        runtime_parameters_info:
          - name: llm
            runtime_parameters_info:
              - name: generation_kwargs
                description:
                  The kwargs to be propagated to either `generate` or `agenerate`
                  methods within each `LLM`.
                keys:
                  - name: max_new_tokens
                    optional: true
                    description:
                      the maximum number of new tokens that the model will generate. Defaults
                      to `128`.
                  - name: frequency_penalty
                    optional: true
                  - name: presence_penalty
                    optional: true
                    description:
                      the presence penalty to use for the generation. Defaults
                      to `0.0`.
                  - name: temperature
                    optional: true
                    description: the temperature to use for the generation. Defaults to `0.1`.
                  - name: top_p
                    optional: true
                    description: the top-p value to use for the generation. Defaults to `1.0`.
              - name: base_url
                optional: true
                description: The base URL to use for the OpenAI API requests.
              - name: api_key
                optional: true
                description: The API key to authenticate the requests to the OpenAI API.
          - name: num_generations
            optional: true
            description: The number of generations to be produced per input.
        type_info:
          module: distilabel.steps.tasks.text_generation
          name: TextGeneration
      name: text_generation_gpt_2
    - step:
        name: push_to_hub
        input_mappings: {}
        output_mappings: {}
        input_batch_size: 50
        repo_id: null
        split: train
        private: false
        token: null
        runtime_parameters_info:
          - name: repo_id
            optional: false
            description:
              The Hugging Face Hub repository ID where the dataset will be
              uploaded.
          - name: split
            optional: true
            description: The split of the dataset that will be pushed. Defaults to 'train'.
          - name: private
            optional: true
            description:
              Whether the dataset to be pushed should be private or not. Defaults
              to `False`.
          - name: token
            optional: true
            description:
              The token that will be used to authenticate in the Hub. If not
              provided, the token will be tried to be obtained from the environment variable
              `HF_TOKEN`. If not provided using one of the previous methods, then `huggingface_hub`
              library will try to use the token from the local Hugging Face CLI configuration.
              Defaults to `None`.
        type_info:
          module: distilabel.steps.globals.huggingface
          name: PushToHub
      name: push_to_hub
    - step:
        name: push_to_hub_2
        input_mappings: {}
        output_mappings: {}
        input_batch_size: 50
        repo_id: null
        split: train
        private: false
        token: null
        runtime_parameters_info:
          - name: repo_id
            optional: false
            description:
              The Hugging Face Hub repository ID where the dataset will be
              uploaded.
          - name: split
            optional: true
            description: The split of the dataset that will be pushed. Defaults to 'train'.
          - name: private
            optional: true
            description:
              Whether the dataset to be pushed should be private or not. Defaults
              to `False`.
          - name: token
            optional: true
            description:
              The token that will be used to authenticate in the Hub. If not
              provided, the token will be tried to be obtained from the environment variable
              `HF_TOKEN`. If not provided using one of the previous methods, then `huggingface_hub`
              library will try to use the token from the local Hugging Face CLI configuration.
              Defaults to `None`.
        type_info:
          module: distilabel.steps.globals.huggingface
          name: PushToHub
      name: push_to_hub_2
  connections:
    - from: load_hub_dataset
      to:
        - text_generation_gpt
        - text_generation_gpt_2
    - from: text_generation_gpt
      to:
        - push_to_hub
    - from: text_generation_gpt_2
      to:
        - push_to_hub_2
    - from: push_to_hub
      to: []
    - from: push_to_hub_2
      to: []
  type_info:
    module: distilabel.pipeline.local
    name: Pipeline