# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json

Convert to OpenAI API:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""

import argparse
import json
import random
from statistics import mean
from typing import Any

import pandas as pd  # type: ignore
import tqdm  # type: ignore
from transformers import AutoTokenizer  # type: ignore


def has_non_english_chars(text: str) -> bool:
    # Approximation: treat any non-ASCII character as non-English
    return not text.isascii()


def content_is_valid(
    content: str, min_content_len: int | None, max_content_len: int | None
) -> bool:
    if min_content_len and len(content) < min_content_len:
        return False

    if max_content_len and len(content) > max_content_len:
        return False

    # Reject messages that contain non-English (non-ASCII) characters
    return not has_non_english_chars(content)


def print_stats(
    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
) -> None:
    # Collect statistics
    stats = []

    print("\nCollecting statistics...")
    for item in tqdm.tqdm(conversations):
        # item has "id" and "messages"
        messages = item["messages"]

        user_turns = 0
        assistant_turns = 0
        user_words = 0
        assistant_words = 0
        conv_chars = 0

        user_tokens: list[int] = []
        assistant_tokens: list[int] = []

        for m in messages:
            content = m["content"]
            conv_chars += len(content)
            # Approximate word count: number of spaces + 1
            content_num_words = content.count(" ") + 1

            num_tokens = 0
            if tokenizer:
                num_tokens = len(tokenizer(m["content"]).input_ids)

            if m["role"] == "user":
                user_turns += 1
                user_words += content_num_words
                if tokenizer:
                    user_tokens.append(num_tokens)

            elif m["role"] == "assistant":
                assistant_turns += 1
                assistant_words += content_num_words
                if tokenizer:
                    assistant_tokens.append(num_tokens)

        # assert user_turns == assistant_turns, f"Invalid conversation ID {item['id']}"

        conv_words = user_words + assistant_words
        item_stats = {
            "user_turns": user_turns,
            "assistant_turns": assistant_turns,
            "user_words": user_words,
            "assistant_words": assistant_words,
            "conv_turns": len(messages),
            "conv_words": conv_words,
            "conv_characters": conv_chars,
        }

        if len(user_tokens) > 0:
            item_stats["user_tokens"] = int(mean(user_tokens))

        if len(assistant_tokens) > 0:
            item_stats["assistant_tokens"] = int(mean(assistant_tokens))

        stats.append(item_stats)

    print("\nStatistics:")
    percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
    df = pd.DataFrame(stats)
    print(df.describe(percentiles=percentiles).transpose())


def convert_sharegpt_to_openai(
    seed: int,
    input_file: str,
    output_file: str,
    max_items: int | None,
    min_content_len: int | None = None,
    max_content_len: int | None = None,
    min_turns: int | None = None,
    max_turns: int | None = None,
    model: str | None = None,
) -> None:
    if min_turns and max_turns:
        assert min_turns <= max_turns

    if min_content_len and max_content_len:
        # Verify that min is not larger than max if both were given
        assert min_content_len <= max_content_len

    print(
        f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
        f" {max_content_len=}, {min_turns=}, {max_turns=}, {model=}\n"
    )

    random.seed(seed)

    tokenizer = None
    if model is not None:
        print(f"Loading tokenizer from: {model}")
        tokenizer = AutoTokenizer.from_pretrained(model)

    # Read the ShareGPT JSON file
    print(f"Reading file: {input_file}")
    with open(input_file, encoding="utf-8") as f:
        # Should be a list of dicts
        # Each dict should have "id" (string) and "conversations" (list of dicts)
        sharegpt_data = json.load(f)

    assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"

    print(f"Total items in input file: {len(sharegpt_data):,}")

    print(f"Shuffling dataset with seed {seed}")
    random.shuffle(sharegpt_data)

    # Map each conversation ID to all of its message parts
    conversation_parts: dict[str, list[Any]] = {}

    for item in tqdm.tqdm(sharegpt_data):
        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        # Conversation ID (e.g. "hiWPlMD") and part/session index (0, 1, 2, ...)
        # rsplit handles conversation IDs that themselves contain underscores
        conv_id, _ = item["id"].rsplit("_", 1)
        new_turns = item["conversations"]

        if conv_id not in conversation_parts:
            # Start new conversation
            conversation_parts[conv_id] = []
        elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
            prev_turns = conversation_parts[conv_id][-1]
            if prev_turns[-1]["from"] == new_turns[0]["from"]:
                new_turns = new_turns[1:]

        if len(new_turns) > 0:
            # We assume that parts are in order in the ShareGPT dataset
            conversation_parts[conv_id].append(new_turns)

    dataset: list[dict[str, Any]] = []
    for conv_id, conv_parts in conversation_parts.items():
        new_item = {"id": conv_id}

        conversations: list[dict[str, str]] = []

        # Merge all parts
        for conv_part in conv_parts:
            conversations.extend(conv_part)

        if len(conversations) > 0:
            new_item["conversations"] = conversations
            dataset.append(new_item)

    print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")

    # Final output data
    final_openai_dataset: list[dict] = []

    # Filter conversations from the ShareGPT dataset and convert to OpenAI format
    for item in tqdm.tqdm(dataset):
        messages: list[dict] = []

        assert "id" in item, "Missing key 'id'"
        assert "conversations" in item, "Missing key 'conversations'"

        conv_id = item["id"]
        conversations = item["conversations"]

        if min_turns is not None and len(conversations) < min_turns:
            # Skip short conversations
            continue

        # Convert each message in the conversation, up to max_turns if specified
        for i, turn in enumerate(conversations):
            assert "from" in turn and "value" in turn, (
                f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
            )

            role = None
            turn_from = turn["from"]

            if turn_from in {"human", "user"}:
                role = "user"
            elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
                role = "assistant"
            elif turn_from == "system":
                role = "system"

            assert role is not None, (
                f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
            )

            if i == 0 and role != "user":
                # If the first message is not from the user (e.g. from gpt),
                # skip it. This happens when the conversation is a follow-up
                # to a previous conversation from the same user.
                continue

            if max_turns is not None and i >= max_turns:
                break

            # Convert message to OpenAI format (with "role" and "content")
            content = turn["value"]
            messages.append({"role": role, "content": content})

        # Add the converted conversation to the OpenAI format
        if len(messages) > 0:
            valid_messages = True

            # First turn should always be from the user
            user_turn = True

            for m in messages:
                # Make sure that turns alternate between user and assistant
                if (user_turn and m["role"] != "user") or (
                    not user_turn and m["role"] != "assistant"
                ):
                    valid_messages = False
                    break

                user_turn = not user_turn

                content = m["content"]
                valid_messages = content_is_valid(
                    content, min_content_len, max_content_len
                )
                if not valid_messages:
                    break

            if valid_messages:
                final_openai_dataset.append({"id": conv_id, "messages": messages})

    assert len(final_openai_dataset) > 0, "Final number of conversations is zero"

    # Stats before sampling (token stats are only computed
    # for the final sampled set below)
    print_stats(final_openai_dataset)

    if max_items is not None and len(final_openai_dataset) > max_items:
        print(f"\n\nSampling {max_items} items from the dataset...")
        final_openai_dataset = random.sample(final_openai_dataset, max_items)

        # Print stats again after the dataset changed
        # (with token counts if a tokenizer was given)
        print_stats(final_openai_dataset, tokenizer)

    # Write the converted data to a new JSON file
    final_size = len(final_openai_dataset)
    print(f"\nTotal conversations converted (after filtering): {final_size:,}")
    print(f"\nWriting file: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Convert ShareGPT dataset to OpenAI API format"
    )
    parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
    parser.add_argument(
        "output_file", help="Path to the output OpenAI format JSON file"
    )
    parser.add_argument(
        "--seed", type=int, default=0, help="Seed for random number generators"
    )
    parser.add_argument(
        "--max-items",
        type=int,
        default=None,
        help="Maximum number of items in the output file",
    )
    parser.add_argument(
        "--min-turns",
        type=int,
        default=None,
        help="Minimum number of turns per conversation",
    )
    parser.add_argument(
        "--max-turns",
        type=int,
        default=None,
        help="Maximum number of turns per conversation",
    )
    parser.add_argument(
        "--min-content-len",
        type=int,
        default=None,
        help="Min number of characters in the messages' content",
    )
    parser.add_argument(
        "--max-content-len",
        type=int,
        default=None,
        help="Max number of characters in the messages' content",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help="LLM model, only the tokenizer will be used",
    )

    args = parser.parse_args()

    convert_sharegpt_to_openai(
        seed=args.seed,
        input_file=args.input_file,
        output_file=args.output_file,
        max_items=args.max_items,
        min_content_len=args.min_content_len,
        max_content_len=args.max_content_len,
        min_turns=args.min_turns,
        max_turns=args.max_turns,
        model=args.model,
    )


if __name__ == "__main__":
    main()