Commit 1516fed0 authored by one's avatar one
Browse files

Update evo2 entrypoint scripts

parent b52f967e
{
"recommendations": [
"ms-python.python",
"astral-sh.ruff"
]
}
\ No newline at end of file
#!/bin/bash
# Entrypoint: run the evo2 batched-generation test at several batch sizes,
# with an optional hipprof trace per batch size.
set -e

export MODEL_NAME=evo2_7b
# MODEL_DIR holds the downloaded checkpoint; MODEL_PATH is the .pt file
# passed to --local_path.  (Previously MODEL_PATH was exported twice with
# conflicting values — once as the directory, once as the file — and the
# traced command appended evo2_7b.pt to a path that already ended in it.)
MODEL_DIR=/models/arcinstitute/evo2_7b
export MODEL_PATH=${MODEL_DIR}/evo2_7b.pt

# MIOpen / rocBLAS debug logging — enable when diagnosing kernel issues.
# export MIOPEN_ENABLE_LOGGING_CMD=1
# export MIOPEN_ENABLE_LOGGING=1
# export MIOPEN_LOG_LEVEL=6
# export ROCBLAS_LAYER=4

export HIP_VISIBLE_DEVICES=1

# Base test command; --batch_size is appended per run.
EVO_CMD="numactl -m 1 -N 1 \
    python -m evo2.test.test_evo2_generation_batched \
    --model_name ${MODEL_NAME} \
    --local_path ${MODEL_PATH}"

run_all_tests() {
    local batch_size=$1
    echo "================================================"
    echo "Running all tests for batch size ${batch_size}"
    echo "================================================"
    mkdir -p log

    echo "==== Hipprof trace (padding) ===="
    # BUG FIX: previously used the global ${BATCH_SIZE} (always 2) instead
    # of the function argument, so both invocations traced batch size 2.
    hipprof --hip-trace -o log/trace-padding-bs${batch_size} \
        numactl -m 0 -N 0 python -m evo2.test.test_evo2_generation_batched \
        --model_name ${MODEL_NAME} --local_path ${MODEL_PATH} \
        --batch_size ${batch_size}

    echo "==== Normal run ===="
    ${EVO_CMD} --batch_size ${batch_size}

    # Other profiler variants, kept for reference:
    # echo "==== Torch profiler trace for step 0 ===="
    # ${EVO_CMD} --batch_size ${batch_size} --trace --trace_step 0
    # echo "==== Torch profiler trace for step 1 ===="
    # ${EVO_CMD} --batch_size ${batch_size} --trace --trace_step 1
    # echo "==== Hipprof trace ===="
    # hipprof --hip-trace -o log/trace-bs${batch_size} \
    #     ${EVO_CMD} --batch_size ${batch_size}
    # echo "==== Nsight-systems trace ===="
    # nsys profile --force-overwrite=true --stats=true --trace=cuda \
    #     -o log/trace-bs${batch_size} \
    #     ${EVO_CMD} --batch_size ${batch_size}
}

run_all_tests 1
run_all_tests 2
......@@ -2,14 +2,14 @@ import argparse
import csv
from importlib import resources
from pathlib import Path
from typing import List, Optional, Union
from typing import Optional
import numpy as np
import time
import torch
from evo2 import Evo2
def read_prompts(input_file):
    """Read prompt sequences from a CSV file or built-in test data.

    Args:
        input_file: Path to a CSV file on disk, or the bare name of a data
            file shipped inside the ``evo2.test.data`` package (used when the
            string does not resolve to an existing file).

    Returns:
        List of prompt sequences taken from the first CSV column; the header
        row is skipped.
    """
    # A string that is not an existing file path is assumed to name a
    # packaged test-data file and is resolved via importlib.resources.
    if isinstance(input_file, str) and not Path(input_file).is_file():
        with resources.path("evo2.test.data", input_file) as data_path:
            input_file = data_path

    promptseqs = []
    # utf-8-sig strips a leading BOM if the CSV came from e.g. Excel.
    with open(input_file, encoding="utf-8-sig", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip header
        for row in reader:
            promptseqs.append(row[0])

    return promptseqs
def mid_point_split(*, seq, num_tokens):
    """Split a sequence at its (even-rounded) midpoint into prompt and target.

    Args:
        seq: Input sequence (string).
        num_tokens: Number of characters after the split point used as the
            generation target.

    Returns:
        Tuple ``(prompt, target)`` where ``prompt`` is the first half of the
        sequence and ``target`` is the next ``num_tokens`` characters.
    """
    # 2 * (n // 4) rounds len(seq) / 2 down to an even number.
    mid_point = 2 * (len(seq) // 4)
    prompt = seq[:mid_point]
    target = seq[mid_point : mid_point + num_tokens]
    return prompt, target
def calculate_sequence_identity(seq1: str, seq2: str) -> Optional[float]:
    """Calculate sequence identity between two sequences through direct comparison.

    Positions are compared pairwise over the length of the shorter sequence.

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        Percentage (0-100) of matching positions, or ``None`` when either
        sequence is empty.
    """
    if not seq1 or not seq2:
        return None

    min_length = min(len(seq1), len(seq2))
    matches = sum(a == b for a, b in zip(seq1[:min_length], seq2[:min_length]))
    return (matches / min_length) * 100
def generate_and_score(
    *,
    sequences,
    model,
    generations_per_prompt=5,
    n_tokens=500,
    temperature=1.0,
    top_k=1,
    top_p=1.0,
    batch_size=2,
):
    """Prompt with first half, generate and score on 2nd half.

    Args:
        sequences: Iterable of input sequences.
        model: Model exposing ``generate(prompt_seqs, n_tokens, temperature,
            top_k, top_p)`` returning an object with a ``sequences`` list.
        generations_per_prompt: Number of generations scored per sequence.
        n_tokens: Number of tokens to generate per prompt.
        temperature: Sampling temperature forwarded to the model.
        top_k: Top-k sampling parameter forwarded to the model.
        top_p: Top-p sampling parameter forwarded to the model.
        batch_size: Number of prompts per ``model.generate`` call.

    Returns:
        List with one entry per input sequence, each a list of
        ``generations_per_prompt`` identity scores.
    """
    scores = []
    prompts = []
    targets = []

    # Prepare all prompts and targets
    for seq in sequences:
        prompt, target = mid_point_split(seq=seq, num_tokens=n_tokens)
        prompts.extend([prompt] * generations_per_prompt)
        targets.extend([target] * generations_per_prompt)

    for i in range(0, len(prompts), batch_size):
        batch_prompts = prompts[i : i + batch_size]
        batch_targets = targets[i : i + batch_size]

        with torch.inference_mode():
            # Guard the device sync so timing also works on CPU-only builds
            # (torch.cuda.synchronize() raises without a CUDA runtime).
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            step_time = -time.perf_counter()
            generated = model.generate(
                prompt_seqs=batch_prompts,
                n_tokens=n_tokens,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
            )
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            step_time += time.perf_counter()
            print(
                f"[{i}:{min(i + batch_size, len(prompts))}) E2E Time for model.generate: {step_time:.3f} s"
            )

        for j, decoded_seq in enumerate(generated.sequences):
            score = calculate_sequence_identity(decoded_seq, batch_targets[j])
            scores.append(score)

    # Reshape scores to group by original sequence
    reshaped_scores = [
        scores[i : i + generations_per_prompt]
        for i in range(0, len(scores), generations_per_prompt)
    ]

    return reshaped_scores
def custom_trace_handler(
    dir_name="./log/pt-trace/", sort_by="self_device_time_total", top_n=20
):
    """Build an ``on_trace_ready`` callback for ``torch.profiler``.

    The returned callback first forwards the profile to a TensorBoard trace
    handler writing into *dir_name*, then prints a key-averages summary table
    sorted by *sort_by* and limited to *top_n* rows.  Because the sortable
    timing field names differ across torch versions, an unrecognized sort key
    is mapped through a fallback table; when no fallback is available either,
    the table is printed in default order.
    """
    to_tensorboard = torch.profiler.tensorboard_trace_handler(dir_name=dir_name)
    # Field-name fallbacks for torch versions with older profiler columns.
    field_fallbacks = {
        "self_device_time_total": "self_cuda_time_total",
        "device_time_total": "cuda_time_total",
        "self_cuda_time_total": "self_cpu_time_total",
    }

    def handler(prof):
        to_tensorboard(prof)
        avgs = prof.key_averages()
        final_sort_key = sort_by
        if avgs:
            # Probe the first averaged event to see which fields exist.
            probe = avgs[0]
            if not hasattr(probe, final_sort_key):
                fallback_key = field_fallbacks.get(final_sort_key)
                if fallback_key is not None and hasattr(probe, fallback_key):
                    print(
                        f"[PROFILER] '{final_sort_key}' not found. Falling back to '{fallback_key}'."
                    )
                    final_sort_key = fallback_key
                else:
                    print(
                        f"[PROFILER] Sort key '{final_sort_key}' invalid. Using default order."
                    )
                    final_sort_key = None
        print(avgs.table(sort_by=final_sort_key, row_limit=top_n))

    return handler
def generate_and_score_prof(
    *,
    sequences,
    model,
    generations_per_prompt=5,
    n_tokens=500,
    temperature=1.0,
    top_k=1,
    top_p=1.0,
    batch_size=2,
    trace_step=1,
):
    """Prompt with first half, generate and score on 2nd half with torch profiler.

    Same contract as ``generate_and_score``, but runs the generation loop
    under ``torch.profiler``.  The schedule warms up for *trace_step*
    iterations and records exactly one active step, so detailed profiling
    data is captured for batch iteration ``i == trace_step`` only.

    Args:
        sequences: Iterable of input sequences.
        model: Model exposing ``generate(...)`` returning an object with a
            ``sequences`` list.
        generations_per_prompt: Number of generations scored per sequence.
        n_tokens: Number of tokens to generate per prompt.
        temperature: Sampling temperature forwarded to the model.
        top_k: Top-k sampling parameter forwarded to the model.
        top_p: Top-p sampling parameter forwarded to the model.
        batch_size: Number of prompts per ``model.generate`` call.
        trace_step: Batch iteration (0-based) to capture with the profiler.

    Returns:
        List with one entry per input sequence, each a list of
        ``generations_per_prompt`` identity scores.
    """
    scores = []
    prompts = []
    targets = []

    # Prepare all prompts and targets
    for seq in sequences:
        prompt, target = mid_point_split(seq=seq, num_tokens=n_tokens)
        prompts.extend([prompt] * generations_per_prompt)
        targets.extend([target] * generations_per_prompt)

    print("\n[TRACE] Start profiling...")
    with torch.profiler.profile(
        activities=[
            torch.profiler.ProfilerActivity.CPU,
            torch.profiler.ProfilerActivity.CUDA,
        ],
        # warmup=trace_step makes the single active step land on the
        # requested batch iteration.
        schedule=torch.profiler.schedule(wait=0, warmup=trace_step, active=1, repeat=1),
        on_trace_ready=custom_trace_handler(dir_name="./log/pt-trace/"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_flops=True,
    ) as prof:
        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i : i + batch_size]
            batch_targets = targets[i : i + batch_size]

            with torch.inference_mode():
                # Guard the device sync so timing also works on CPU-only
                # builds (torch.cuda.synchronize() raises without CUDA).
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                step_time = -time.perf_counter()
                generated = model.generate(
                    prompt_seqs=batch_prompts,
                    n_tokens=n_tokens,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                )
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                step_time += time.perf_counter()
                print(
                    f"[{i}:{min(i + batch_size, len(prompts))}) E2E Time for model.generate: {step_time:.3f} s"
                )

            for j, decoded_seq in enumerate(generated.sequences):
                score = calculate_sequence_identity(decoded_seq, batch_targets[j])
                scores.append(score)

            # Advance the profiler schedule once per batch.
            prof.step()

    # Reshape scores to group by original sequence
    reshaped_scores = [
        scores[i : i + generations_per_prompt]
        for i in range(0, len(scores), generations_per_prompt)
    ]

    return reshaped_scores
def main():
    """
    Test sequence generation and scoring using the evo2 models.

    Expected mean sequence-identity scores (greedy sampling):
        - Evo 2 40B: 91.15%
        - Evo 2 7B: 89.25%
        - Evo 2 1B base: 68.0%
    """
    parser = argparse.ArgumentParser(description="Test Evo2 Model Generation")
    parser.add_argument(
        "--model_name",
        choices=["evo2_7b", "evo2_40b", "evo2_1b_base"],
        default="evo2_7b",
        help="Model to test (supports evo2_7b, evo2_40b, evo2_1b_base)",
    )
    parser.add_argument("--local_path", type=str, default=None)
    parser.add_argument(
        "--n_tokens", type=int, default=500, help="Number of tokens to generate"
    )
    parser.add_argument(
        "--batch_size", type=int, default=1, help="Batch size for generation"
    )
    parser.add_argument(
        "--trace",
        action="store_true",
        help="Enable torch profiler",
    )
    parser.add_argument(
        "--trace_step",
        type=int,
        default=1,
        help="Attach torch profiler to specific step (default: 1)",
    )
    args = parser.parse_args()

    # NOTE(review): model construction was hidden in the visible chunk; this
    # matches the Evo2 signature used by the CLI flags — confirm upstream.
    model = Evo2(args.model_name, local_path=args.local_path)

    # Test parameters: greedy sampling (top_k=1)
    test_params = {
        "n_tokens": args.n_tokens,
        "temperature": 1.0,
        "top_k": 1,
        "top_p": 1.0,
        "generations_per_prompt": 1,
        "batch_size": args.batch_size,
    }

    # Read and process sequences
    sequences = read_prompts("prompts.csv")

    # Debugging: replace all prompts with the longest prompt so every batch
    # entry has the same length.
    if args.batch_size > 1:
        longest_prompt = max(sequences, key=len)
        sequences = [longest_prompt] * len(sequences)
        print(
            f"[DEBUG] Using longest prompt len={len(longest_prompt)} for all sequences"
        )

    if args.trace:
        print("[TRACE] Using generate_and_score_prof with torch profiler")
        scores = generate_and_score_prof(
            sequences=sequences,
            model=model,
            trace_step=args.trace_step,
            **test_params,
        )
    else:
        scores = generate_and_score(sequences=sequences, model=model, **test_params)

    # Calculate and validate results
    mean_score = np.mean(scores)
    print(f"\nMean sequence identity: {mean_score}%")

    # Validate against expected scores
    eps = 3  # large epsilon for direct comparison, since there are numeric differences by versions
    expected_scores = {"evo2_40b": 91.15, "evo2_7b": 89.25, "evo2_1b_base": 68.0}
    expected_score = expected_scores[args.model_name]

    if abs(mean_score - expected_score) < eps:
        # NOTE(review): the success-branch print was hidden in the visible
        # chunk; reconstructed to mirror the failure message — confirm.
        print(f"\nTest Passed: Expected {expected_score}%, got {mean_score}%")
    else:
        print(f"\nTest Failed: Expected {expected_score}%, got {mean_score}%")


if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment