Unverified Commit d381eb96 authored by Ido Segev's avatar Ido Segev Committed by GitHub
Browse files

Multi turn benchmark progress bar for synthetic conversation generation (#28394)


Signed-off-by: default avatarIdo Segev <idos@pliops.com>
parent 9973e6e0
......@@ -11,6 +11,7 @@ from bench_utils import (
Color,
logger,
)
from tqdm import tqdm
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
......@@ -417,6 +418,10 @@ def generate_conversations(
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
logger.info(
f"Loaded {len(tokens_in_file)} tokens from file {filename}, "
f"total tokens so far: {len(list_of_tokens)}"
)
conversations: ConversationsMap = {}
conv_id = 0
......@@ -449,18 +454,25 @@ def generate_conversations(
)
base_offset += common_prefix_tokens
for conv_id in range(args.num_conversations):
for conv_id in tqdm(
range(args.num_conversations),
total=args.num_conversations,
desc="Generating conversations",
unit="conv",
):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns).astype(int)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns).astype(
int
)
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
......
......@@ -2,4 +2,5 @@ numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1
\ No newline at end of file
xlsxwriter>=3.2.1
tqdm>=4.66
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment