# eval_magma_libero.py

import os
import numpy as np
import draccus
from dataclasses import dataclass
from typing import Optional, Tuple
import tqdm
from libero.libero import benchmark
from libero_env_utils import (
    get_libero_env, 
    get_libero_dummy_action,
    get_libero_obs,
    get_max_steps,
    set_seed_everywhere
)
from libero_magma_utils import (
    get_magma_model,
    get_magma_prompt,
    get_magma_action
)

@dataclass
class LiberoConfig:
    """Settings for evaluating a Magma model on a LIBERO task suite."""

    # ---- Model / benchmark selection ----

    # Name of the Magma model handed to the model loader
    model_name: str = "microsoft/magma-8b-libero-goal"
    # Key of the LIBERO task suite to evaluate
    task_suite_name: str = "libero_goal"

    # ---- Evaluation settings ----

    # How many rollouts are executed for each task
    num_trials_per_task: int = 50
    # Image resolution used for the environment and for resizing observations
    resolution: int = 256
    # Number of initial dummy-action steps to wait for stabilization
    num_steps_wait: int = 10
    # Random seed
    seed: int = 0
    # Directory in which the evaluation log file is written
    save_dir: str = "./libero_eval_log"

def _success_rate(successes: int, episodes: int) -> float:
    """Return ``successes / episodes``, or 0.0 when no episode was run."""
    if episodes == 0:
        return 0.0
    return float(successes) / float(episodes)


def _run_episode(env, initial_state, magma, processor, prompt, cfg, max_steps) -> bool:
    """Roll out a single episode and report whether the env signalled ``done``."""
    env.reset()
    obs = env.set_init_state(initial_state)

    # Stabilization phase: step with the dummy action before the policy takes
    # over. The ``done`` flag is deliberately ignored during these steps.
    for _ in range(cfg.num_steps_wait):
        obs, _, _, _ = env.step(get_libero_dummy_action())

    # Policy phase: at most ``max_steps`` model-driven actions.
    for _ in range(max_steps):
        img = get_libero_obs(obs, resize_size=cfg.resolution)
        action = get_magma_action(magma, processor, img, prompt, cfg.task_suite_name)
        obs, _, done, _ = env.step(action.tolist())
        if done:
            # ``done`` from the environment is what this script counts as success.
            return True
    return False


@draccus.wrap()
def eval_libero(cfg: LiberoConfig) -> Tuple[int, int]:
    """
    Evaluate a Magma model on every task of a LIBERO task suite.

    For each task, ``cfg.num_trials_per_task`` episodes are rolled out, each
    starting from the task's initial state with the same index as the trial.
    Per-task and overall success rates are printed and written to
    ``{cfg.save_dir}/magma_eval-{cfg.task_suite_name}.log``.

    Every environment is closed as soon as its task is finished and the log
    file is closed even if the evaluation raises. A success rate is reported
    as 0.0 when no episode was run (zero trials or an empty suite).

    Args:
        cfg: LiberoConfig object containing evaluation parameters

    Returns:
        Tuple[int, int]: Total episodes and total successful episodes
    """
    # Setup logging
    os.makedirs(cfg.save_dir, exist_ok=True)
    log_filepath = f"{cfg.save_dir}/magma_eval-{cfg.task_suite_name}.log"

    total_episodes, total_successes = 0, 0

    # The context manager guarantees the log is flushed and closed on errors.
    with open(log_filepath, "w", encoding="utf-8") as log_file:
        print(f"Logging to local log file: {log_filepath}")

        # Write initial log
        log_file.write(f"Task suite: {cfg.task_suite_name}\n")
        print(f"Task suite: {cfg.task_suite_name}")

        # Get benchmark and task suite
        benchmark_dict = benchmark.get_benchmark_dict()
        task_suite = benchmark_dict[cfg.task_suite_name]()
        num_tasks_in_suite = task_suite.n_tasks

        set_seed_everywhere(cfg.seed)

        # Load model
        processor, magma = get_magma_model(cfg.model_name)

        # The step budget depends only on the suite name, so compute it once.
        max_steps = get_max_steps(cfg.task_suite_name)

        # Iterate through all tasks
        for task_id in tqdm.tqdm(range(num_tasks_in_suite)):
            task = task_suite.get_task(task_id)
            task_name = task.name

            # Get default LIBERO initial states
            initial_states = task_suite.get_task_init_states(task_id)

            # Initialize LIBERO environment and task description
            env, task_description = get_libero_env(task, resolution=cfg.resolution)
            try:
                print(f"[info] Evaluating task {task_id} from suite {cfg.task_suite_name}, "
                      f"the language instruction is {task_description}.")
                log_file.write(f"Task {task_id}: {task_description}\n")
                log_file.flush()

                # Get prompt for current task
                prompt = get_magma_prompt(task_description, processor, magma.config)

                task_episodes, task_successes = 0, 0

                # Run trials for current task. ``initial_states`` must hold at
                # least ``num_trials_per_task`` entries, otherwise the indexing
                # below fails.
                for trial in range(cfg.num_trials_per_task):
                    succeeded = _run_episode(
                        env, initial_states[trial], magma, processor, prompt, cfg, max_steps
                    )
                    if succeeded:
                        task_successes += 1
                    task_episodes += 1
            finally:
                # One environment is created per task, so release it per task
                # instead of closing only the last one after the loop.
                env.close()

            # Update total counters
            total_episodes += task_episodes
            total_successes += task_successes

            # Log task success rate
            task_success_rate = _success_rate(task_successes, task_episodes)
            print(f"Current task ({task_name}) success rate: {task_success_rate}")
            log_file.write(f"Current task ({task_name}) success rate: {task_success_rate}\n")
            log_file.flush()

        # Log final suite success rate
        suite_success_rate = _success_rate(total_successes, total_episodes)
        print(f"Task suite success rate: {suite_success_rate}")
        log_file.write(f"\nTask suite {cfg.task_suite_name} success rate: {suite_success_rate}\n")
        log_file.flush()

    return total_episodes, total_successes

# Script entry point. eval_libero is decorated with draccus.wrap(), so it is
# called without arguments here; the LiberoConfig is presumably built by
# draccus from the command line — confirm against the draccus documentation.
if __name__ == "__main__":
    eval_libero()