description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
)
parser.add_argument(
"--data_path",
required=True,
help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
)
parser.add_argument(
"--project_name",
required=True,
help="The name of the generated Zeno project.",
)
returnparser.parse_args()
def main():
    """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

    This script expects your results to live in a data folder where subfolders
    contain results of individual models.
    """
    args = parse_args()
    # ZENO_API_KEY must be set in the environment; KeyError here means it is missing.
    client = ZenoClient(os.environ["ZENO_API_KEY"])

    # Get all model subfolders from the parent data folder; the subfolder
    # name is used as the model name.
    models = [
        os.path.basename(os.path.normpath(f))
        for f in os.scandir(Path(args.data_path))
        if f.is_dir()
    ]
    assert len(models) > 0, "No model directories found in the data_path."
"arc_challenge":"Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada":"Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
}
task_dict=lm_eval.tasks.get_task_dict(task_names)
fortask_name,taskintask_dict.items():
# patch description field in task (# TODO: make this much more cleaned up)