"""Evaluate trained models and export structured CSV/plots.""" import argparse import concurrent.futures import copy import glob import inspect import os import sys from typing import Dict, List, Optional, Tuple PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if PROJECT_ROOT not in sys.path: sys.path.insert(0, PROJECT_ROOT) import matplotlib import numpy as np import pandas as pd import yaml matplotlib.use("Agg") import matplotlib.pyplot as plt from agents.appo_agent import APPOAgent from agents.dcmappo_agent import DCMAPPOAgent from agents.ddqn_agent import DDQNAgent from agents.ddpg_agent import DDPGAgent from agents.dqn_agent import DQNAgent from agents.gpro_agent import GPROAgent from agents.madqn_agent import MADQNAgent from agents.mappo_agent import MAPPOAgent from agents.ppo_agent import PPOAgent from agents.qmix_agent import QMIXAgent from agents.dcqmix_agent import DCQMIXAgent from agents.sac_agent import SACAgent from agents.sctd3_agent import SCTD3Agent from agents.tcamappo_agent import TCAMAPPOAgent from agents.td3_agent import TD3Agent from envs.edge_vsl_env import SUMOEdgeVSLEnvironment from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS from utils.config import get_agent_config from utils.heatmap_plotting import ( build_action_panel, build_occupancy_panel, build_speed_panel, save_heatmap_panels, ) from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root MODEL_ORDER = ["ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix", "ddpg", "sac", "td3", "sctd3"] BASELINE_NAME = "no_control" EVAL_ORDER = [BASELINE_NAME] + MODEL_ORDER MODEL_LABELS = { BASELINE_NAME: "NO_CONTROL", "ppo": "PPO", "gpro": "GPRO-PPO", "appo": "APPO", "mappo": "MAPPO", "tcamappo": "TCA-MAPPO", "dcmappo": "DC-MAPPO", "dqn": "DQN", "madqn": "MA-DQN", "ddqn": "DDQN", "qmix": "QMIX", "dcqmix": "DC-QMIX", "ddpg": "DDPG", "sac": "SAC", "td3": "TD3", "sctd3": "SC-TD3", } def parse_args(): parser = argparse.ArgumentParser(description="Evaluate trained SUMO VSL models.") parser.add_argument( "--checkpoint-root", type=str, default=None, help="Checkpoint root or run root. Default: latest under runs/.", ) parser.add_argument( "--output-dir", type=str, default=None, help="Evaluation output directory. Default: results/evaluations/.", ) parser.add_argument( "--config", type=str, default="config_sumo_vsl.yaml", help="Fallback config path when the shared run config is unavailable.", ) parser.add_argument( "--models", nargs="*", default=None, help="Subset of models to evaluate, e.g. --models ppo gpro tcamappo dcmappo dqn madqn ddqn qmix dcqmix sac td3 sctd3", ) parser.add_argument("--seed", type=int, default=42, help="Evaluation seed.") parser.add_argument( "--workers", type=int, default=None, help="Number of parallel evaluation workers. Default: min(model_count, max(1, cpu_count//2)).", ) parser.add_argument( "--with-gui", action="store_true", help="Run evaluation with SUMO GUI enabled.", ) parser.add_argument( "--begin-time", type=int, default=None, help="Override SUMO evaluation begin time in seconds. Default: use training config.", ) parser.add_argument( "--end-time", type=int, default=None, help="Override SUMO evaluation end time in seconds. Default: use training config.", ) parser.add_argument( "--step-length", type=float, default=None, help="Override SUMO simulation step length for evaluation only. Default: use training config.", ) parser.add_argument( "--route-file", type=str, default=None, help="Override SUMO route/flow file for evaluation only. Supports absolute paths or project-relative paths.", ) return parser.parse_args() def normalize_model_name(name: str) -> str: lowered = name.strip().lower() if lowered not in MODEL_ORDER: raise ValueError(f"Unsupported model name: {name}") return lowered def discover_model_dirs(checkpoint_root: str, requested_models: List[str] = None) -> Dict[str, str]: checkpoint_root = os.path.abspath(checkpoint_root) requested = [normalize_model_name(m) for m in requested_models] if requested_models else None discovered = {} for model_name in MODEL_ORDER: model_dir = os.path.join(checkpoint_root, model_name) if os.path.isdir(model_dir): discovered[model_name] = model_dir if discovered: if requested is None: return discovered return {k: v for k, v in discovered.items() if k in requested} base_name = os.path.basename(checkpoint_root).lower() parent_name = os.path.basename(os.path.dirname(checkpoint_root)).lower() grandparent_name = os.path.basename(os.path.dirname(os.path.dirname(checkpoint_root))).lower() if base_name in MODEL_LABELS and ( parent_name == "checkpoints" or grandparent_name in {"checkpoints", "multi-model"} ): model_name = base_name if requested is not None and model_name not in requested: return {} return {model_name: checkpoint_root} if os.path.isfile(os.path.join(checkpoint_root, "config.yaml")): model_name = None if parent_name in MODEL_LABELS: model_name = parent_name elif base_name in MODEL_LABELS: model_name = base_name elif requested and len(requested) == 1: model_name = requested[0] if model_name is None: raise ValueError( "Single checkpoint dir detected, but model type cannot be inferred. " "Please pass --models ." ) return {model_name: checkpoint_root} raise FileNotFoundError(f"No model checkpoint directories found in: {checkpoint_root}") def infer_eval_run_name(checkpoint_root: str) -> str: normalized_root = os.path.abspath(checkpoint_root) base_name = os.path.basename(normalized_root) parent_dir = os.path.dirname(normalized_root) parent_name = os.path.basename(parent_dir) grandparent_dir = os.path.dirname(parent_dir) grandparent_name = os.path.basename(grandparent_dir) if base_name == "checkpoints": return parent_name if parent_name == "checkpoints": return f"{base_name}_{grandparent_name}" if grandparent_name == "multi-model": return f"{base_name}_{parent_name}" if parent_name == "multi-model": return base_name if parent_name in MODEL_ORDER: return f"{parent_name}_{base_name}" return base_name def resolve_eval_output_dir(output_dir: str, checkpoint_root: str) -> str: if output_dir: return output_dir run_name = infer_eval_run_name(checkpoint_root) return os.path.join("results", "evaluations", run_name) def load_config_for_checkpoint(checkpoint_dir: Optional[str], fallback_config_path: str) -> dict: with open(fallback_config_path, "r", encoding="utf-8") as f: base_config = yaml.safe_load(f) checkpoint_config = find_shared_config_path(checkpoint_dir, fallback_config_path) if checkpoint_config and os.path.isfile(checkpoint_config): with open(checkpoint_config, "r", encoding="utf-8") as f: checkpoint_loaded = yaml.safe_load(f) return deep_merge_dicts(base_config, checkpoint_loaded) return base_config def deep_merge_dicts(base: dict, override: dict) -> dict: merged = copy.deepcopy(base) for key, value in override.items(): if isinstance(value, dict) and isinstance(merged.get(key), dict): merged[key] = deep_merge_dicts(merged[key], value) else: merged[key] = copy.deepcopy(value) return merged def resolve_project_path(path_str: Optional[str]) -> Optional[str]: if not path_str: return None if os.path.isabs(path_str): return path_str return os.path.abspath(os.path.join(PROJECT_ROOT, path_str)) def resolve_model_load_path(model_name: str, checkpoint_dir: str) -> str: if model_name in {"ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix"}: best_path = os.path.join(checkpoint_dir, "model_best.pt") if os.path.isfile(best_path): return best_path episode_paths = sorted(glob.glob(os.path.join(checkpoint_dir, "model_ep*.pt"))) if episode_paths: return episode_paths[-1] else: best_zip = os.path.join(checkpoint_dir, "model_best.zip") best_base = os.path.join(checkpoint_dir, "model_best") if os.path.isfile(best_zip): return best_base episode_paths = sorted(glob.glob(os.path.join(checkpoint_dir, "model_ep*.zip"))) if episode_paths: return episode_paths[-1][:-4] raise FileNotFoundError(f"No checkpoint file found for {model_name} under: {checkpoint_dir}") def build_value_based_agent(agent_cls, agent_cfg: dict, env: SUMOEdgeVSLEnvironment): candidate_kwargs = { "state_dim": env.state_dim, "num_edges": env.num_controlled_edges, "num_actions_per_edge": env.action_dim, "hidden_dim": agent_cfg.get("hidden_dim", 256), "mixing_hidden_dim": agent_cfg.get( "mixing_hidden_dim", agent_cfg.get("hidden_dim", 256), ), "learning_rate": agent_cfg.get("learning_rate", 3e-4), "gamma": agent_cfg.get("gamma", 0.99), "epsilon_start": agent_cfg.get("epsilon_start", 1.0), "epsilon_end": agent_cfg.get("epsilon_end", 0.01), "epsilon_decay": agent_cfg.get("epsilon_decay", 600), "buffer_size": agent_cfg.get("buffer_size", 20000), "batch_size": agent_cfg.get("batch_size", 128), "target_update": agent_cfg.get("target_update", 5), "device": agent_cfg.get("device", "cuda"), "edge_feature_dim": env.features_per_edge, "time_feature_dim": 3, "total_edge_count": env.num_edges, "controlled_start_index": env.controlled_edge_start_index, "num_corridor_blocks": agent_cfg.get("num_corridor_blocks", 2), "corridor_kernel_size": agent_cfg.get("corridor_kernel_size", 5), "corridor_dropout": agent_cfg.get("corridor_dropout", 0.05), } accepted = inspect.signature(agent_cls).parameters filtered_kwargs = { key: value for key, value in candidate_kwargs.items() if key in accepted } return agent_cls(**filtered_kwargs) def build_agent(model_name: str, config: dict, env: SUMOEdgeVSLEnvironment): agent_cfg = get_agent_config(config, model_name) if model_name == "ppo": return PPOAgent( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, hidden_layers=agent_cfg.get("hidden_layers", [256, 256]), learning_rate=agent_cfg.get("learning_rate", 3e-4), gamma=agent_cfg.get("gamma", 0.99), gae_lambda=agent_cfg.get("gae_lambda", 0.95), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), value_coef=agent_cfg.get("value_coef", 0.5), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "gpro": return GPROAgent( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, hidden_layers=agent_cfg.get("hidden_layers", [256, 256]), learning_rate=agent_cfg.get("learning_rate", 3e-4), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), group_size=agent_cfg.get("group_size", 4), advantage_epsilon=agent_cfg.get("advantage_epsilon", 1e-8), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "appo": return APPOAgent( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, edge_feature_dim=env.features_per_edge, total_edge_count=env.num_edges, controlled_start_index=env.controlled_edge_start_index, hidden_dim=agent_cfg.get("hidden_dim", 128), num_heads=agent_cfg.get("num_heads", 4), num_layers=agent_cfg.get("num_layers", 2), learning_rate=agent_cfg.get("learning_rate", 3e-4), gamma=agent_cfg.get("gamma", 0.99), gae_lambda=agent_cfg.get("gae_lambda", 0.95), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), value_coef=agent_cfg.get("value_coef", 0.5), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "mappo": return MAPPOAgent( state_dim=env.state_dim, num_agents=env.num_controlled_edges, num_actions=env.action_dim, edge_feature_dim=env.features_per_edge, total_edge_count=env.num_edges, controlled_start_index=env.controlled_edge_start_index, hidden_dim=agent_cfg.get("hidden_dim", 256), critic_hidden_dim=agent_cfg.get("critic_hidden_dim", 256), learning_rate=agent_cfg.get("learning_rate", 3e-4), gamma=agent_cfg.get("gamma", 0.99), gae_lambda=agent_cfg.get("gae_lambda", 0.95), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), value_coef=agent_cfg.get("value_coef", 0.5), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "dcmappo": return DCMAPPOAgent( state_dim=env.state_dim, num_agents=env.num_controlled_edges, num_actions=env.action_dim, edge_feature_dim=env.features_per_edge, total_edge_count=env.num_edges, controlled_start_index=env.controlled_edge_start_index, hidden_dim=agent_cfg.get("hidden_dim", 256), critic_hidden_dim=agent_cfg.get("critic_hidden_dim", 256), num_corridor_blocks=agent_cfg.get("num_corridor_blocks", 2), corridor_kernel_size=agent_cfg.get("corridor_kernel_size", 3), corridor_dropout=agent_cfg.get("corridor_dropout", 0.05), learning_rate=agent_cfg.get("learning_rate", 3e-4), gamma=agent_cfg.get("gamma", 0.99), gae_lambda=agent_cfg.get("gae_lambda", 0.95), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), value_coef=agent_cfg.get("value_coef", 0.5), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "tcamappo": return TCAMAPPOAgent( state_dim=env.state_dim, num_agents=env.num_controlled_edges, num_actions=env.action_dim, edge_feature_dim=env.features_per_edge, total_edge_count=env.num_edges, controlled_start_index=env.controlled_edge_start_index, hidden_dim=agent_cfg.get("hidden_dim", 256), critic_hidden_dim=agent_cfg.get("critic_hidden_dim", 256), history_window=agent_cfg.get("history_window", 6), critic_num_heads=agent_cfg.get("critic_num_heads", 4), critic_num_layers=agent_cfg.get("critic_num_layers", 2), critic_dropout=agent_cfg.get("critic_dropout", 0.05), learning_rate=agent_cfg.get("learning_rate", 3e-4), gamma=agent_cfg.get("gamma", 0.99), gae_lambda=agent_cfg.get("gae_lambda", 0.95), clip_epsilon=agent_cfg.get("clip_epsilon", 0.2), value_coef=agent_cfg.get("value_coef", 0.5), entropy_coef=agent_cfg.get("entropy_coef", 0.01), max_grad_norm=agent_cfg.get("max_grad_norm", 0.5), ppo_epochs=agent_cfg.get("ppo_epochs", 4), minibatch_size=agent_cfg.get("batch_size", 15), device=agent_cfg.get("device", "cuda"), lr_schedule=agent_cfg.get("lr_schedule", "cosine"), total_episodes=config.get("training", {}).get("num_episodes", 4000), ) if model_name == "dqn": return build_value_based_agent(DQNAgent, agent_cfg, env) if model_name == "madqn": return build_value_based_agent(MADQNAgent, agent_cfg, env) if model_name == "ddqn": return build_value_based_agent(DDQNAgent, agent_cfg, env) if model_name == "qmix": return build_value_based_agent(QMIXAgent, agent_cfg, env) if model_name == "dcqmix": return build_value_based_agent(DCQMIXAgent, agent_cfg, env) if model_name == "ddpg": return DDPGAgent( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, learning_rate=agent_cfg.get("learning_rate", 3e-4), buffer_size=agent_cfg.get("buffer_size", 20000), learning_starts=agent_cfg.get("learning_starts", 200), batch_size=agent_cfg.get("batch_size", 128), tau=agent_cfg.get("tau", 0.005), gamma=agent_cfg.get("gamma", 0.99), exploration_sigma=agent_cfg.get("exploration_sigma", 0.15), device=agent_cfg.get("device", "cuda"), actor_hidden_dims=agent_cfg.get("actor_hidden_dims"), critic_hidden_dims=agent_cfg.get("critic_hidden_dims"), activation_fn=agent_cfg.get("activation_fn", "relu"), ) if model_name == "sac": return SACAgent( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, learning_rate=agent_cfg.get("learning_rate", 3e-4), buffer_size=agent_cfg.get("buffer_size", 20000), learning_starts=agent_cfg.get("learning_starts", 200), batch_size=agent_cfg.get("batch_size", 128), tau=agent_cfg.get("tau", 0.005), gamma=agent_cfg.get("gamma", 0.99), ent_coef=agent_cfg.get("ent_coef", "auto"), target_entropy=agent_cfg.get("target_entropy", "auto"), target_update_interval=agent_cfg.get("target_update_interval", 1), log_std_init=agent_cfg.get("log_std_init", -3.0), device=agent_cfg.get("device", "cuda"), actor_hidden_dims=agent_cfg.get("actor_hidden_dims"), critic_hidden_dims=agent_cfg.get("critic_hidden_dims"), activation_fn=agent_cfg.get("activation_fn", "relu"), ) if model_name in {"td3", "sctd3"}: agent_cls = SCTD3Agent if model_name == "sctd3" else TD3Agent common_kwargs = dict( state_dim=env.state_dim, action_dims=[env.action_dim] * env.num_controlled_edges, learning_rate=agent_cfg.get("learning_rate", 3e-4), buffer_size=agent_cfg.get("buffer_size", 20000), learning_starts=agent_cfg.get("learning_starts", 200), batch_size=agent_cfg.get("batch_size", 128), tau=agent_cfg.get("tau", 0.005), gamma=agent_cfg.get("gamma", 0.99), policy_delay=agent_cfg.get("policy_delay", 2), exploration_sigma=agent_cfg.get("exploration_sigma", 0.15), device=agent_cfg.get("device", "cuda"), actor_hidden_dims=agent_cfg.get("actor_hidden_dims"), critic_hidden_dims=agent_cfg.get("critic_hidden_dims"), ) if model_name == "sctd3": common_kwargs.update( edge_feature_dim=env.features_per_edge, total_edge_count=env.num_edges, controlled_start_index=env.controlled_edge_start_index, extractor_feature_dim=agent_cfg.get("extractor_feature_dim", 128), extractor_edge_hidden_dim=agent_cfg.get("extractor_edge_hidden_dim", 16), extractor_global_hidden_dim=agent_cfg.get("extractor_global_hidden_dim", 32), extractor_spatial_blocks=agent_cfg.get("extractor_spatial_blocks", 1), extractor_kernel_size=agent_cfg.get("extractor_kernel_size", 3), activation_fn=agent_cfg.get("activation_fn", "relu"), ) else: common_kwargs.update( activation_fn=agent_cfg.get("activation_fn", "relu"), ) return agent_cls(**common_kwargs) raise ValueError(f"Unsupported model name: {model_name}") def select_deterministic_action(agent, state: np.ndarray) -> np.ndarray: action, _, _ = agent.select_action(state, deterministic=True) return action def select_no_control_action(env: SUMOEdgeVSLEnvironment) -> np.ndarray: if env.num_controlled_edges <= 0: return np.zeros(0, dtype=np.int64) return np.full(env.num_controlled_edges, env.action_dim - 1, dtype=np.int64) def resolve_logged_action_info( model_name: str, env: SUMOEdgeVSLEnvironment, action: Optional[np.ndarray], action_applied_mask: List[bool], edge_idx: int, action_speed_kmh: float, ) -> Tuple[int, float]: if not action_applied_mask[edge_idx]: return -1, action_speed_kmh controlled_idx = edge_idx - env.controlled_edge_start_index if action is None or controlled_idx < 0 or controlled_idx >= len(action): return -1, action_speed_kmh return int(action[controlled_idx]), action_speed_kmh def update_mainline_travel_time_tracking( info: dict, mainline_depart_times: Dict[str, float], active_mainline_vehicle_ids: set, completed_mainline_travel_times: List[float], ) -> Tuple[int, float, float]: for event in info.get("departed_vehicle_events", []): if not event.get("is_mainline", False): continue veh_id = str(event["vehicle_id"]) mainline_depart_times[veh_id] = float(event["sim_time"]) active_mainline_vehicle_ids.add(veh_id) interval_travel_times = [] for event in info.get("arrived_vehicle_events", []): veh_id = str(event["vehicle_id"]) if veh_id not in active_mainline_vehicle_ids: continue depart_time = mainline_depart_times.pop(veh_id, None) active_mainline_vehicle_ids.discard(veh_id) if depart_time is None: continue travel_time = float(event["sim_time"]) - depart_time if travel_time < 0: continue interval_travel_times.append(travel_time) completed_mainline_travel_times.append(travel_time) interval_mean = float(np.mean(interval_travel_times)) if interval_travel_times else np.nan cumulative_mean = ( float(np.mean(completed_mainline_travel_times)) if completed_mainline_travel_times else np.nan ) return len(interval_travel_times), interval_mean, cumulative_mean def _extract_reward_components(info: dict) -> Dict[str, float]: return {column: info.get(column, np.nan) for column in REWARD_COMPONENT_COLUMNS} def evaluate_single_model( model_name: str, checkpoint_dir: Optional[str], fallback_config_path: str, output_dir: str, seed: int, begin_time: int, end_time: int, with_gui: bool, step_length: Optional[float], route_file: Optional[str], ) -> Tuple[pd.DataFrame, pd.DataFrame, dict]: config = load_config_for_checkpoint(checkpoint_dir, fallback_config_path) if checkpoint_dir else load_config_for_checkpoint("", fallback_config_path) runtime_config = copy.deepcopy(config) runtime_config.setdefault("sumo", {}) effective_begin_time = runtime_config["sumo"].get("begin_time") effective_end_time = runtime_config["sumo"].get("end_time") if begin_time is not None: runtime_config["sumo"]["begin_time"] = begin_time effective_begin_time = begin_time if end_time is not None: runtime_config["sumo"]["end_time"] = end_time effective_end_time = end_time runtime_config["sumo"]["gui"] = with_gui if step_length is not None: runtime_config["sumo"]["step_length"] = step_length if route_file is not None: runtime_config["sumo"]["route_file"] = route_file runtime_config.setdefault("runtime", {})["output_dir"] = os.path.join(output_dir, model_name) runtime_config["runtime"]["metrics_subdir"] = "eval_sumo_metrics" runtime_config["runtime"]["collect_detector_cells"] = True runtime_config["runtime"]["use_vehicle_subscriptions"] = True runtime_config["runtime"]["collect_trip_events"] = True runtime_config["runtime"]["evaluation_mode"] = True env = SUMOEdgeVSLEnvironment(runtime_config) agent = None load_path = None if model_name != BASELINE_NAME: agent = build_agent(model_name, runtime_config, env) load_path = resolve_model_load_path(model_name, checkpoint_dir) agent.load(load_path) if hasattr(agent, "reset_episode"): agent.reset_episode() state = env.reset(seed=seed) step_rows = [] edge_rows = [] detector_rows = [] step_idx = 0 mainline_depart_times: Dict[str, float] = {} active_mainline_vehicle_ids = set() completed_mainline_travel_times: List[float] = [] while True: if model_name == BASELINE_NAME: action = select_no_control_action(env) next_state, reward, done, info = env.step(action, apply_control=True) else: action = select_deterministic_action(agent, state) next_state, reward, done, info = env.step(action, apply_control=True) if model_name != BASELINE_NAME and hasattr(agent, "update_temporal_context"): agent.update_temporal_context(state, action, reward, info) step_idx += 1 ( mainline_completed_count, mainline_interval_travel_time_mean_s, mainline_travel_time_cumulative_mean_s, ) = update_mainline_travel_time_tracking( info, mainline_depart_times, active_mainline_vehicle_ids, completed_mainline_travel_times, ) step_row = { "model": model_name, "model_label": MODEL_LABELS[model_name], "step": step_idx, "sim_time": info.get("sim_time", np.nan), "reward": reward, "throughput": info.get("throughput", np.nan), "arrived_count": info.get("arrived_count", np.nan), "departed_count": info.get("departed_count", np.nan), "mean_speed_kmh": info.get("mean_speed_kmh", np.nan), "speed_variance_norm": info.get("speed_variance_norm", np.nan), "mean_occupancy": info.get("mean_occupancy", np.nan), "density": info.get("density", np.nan), "num_vehicles": info.get("num_vehicles", np.nan), "num_stops": info.get("num_stops", np.nan), "mainline_completed_count": mainline_completed_count, "mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s, "mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s, } step_row.update(_extract_reward_components(info)) step_rows.append(step_row) measured_speeds_ms = info.get("edge_speeds_ms", []) occupancies = info.get("edge_occupancies", []) action_speeds_kmh = info.get("edge_speeds_kmh", []) action_applied_mask = info.get("action_applied_mask", [True] * env.num_edges) for edge_idx, edge_id in enumerate(env.control_edges): action_index, logged_speed_kmh = resolve_logged_action_info( model_name=model_name, env=env, action=action, action_applied_mask=action_applied_mask, edge_idx=edge_idx, action_speed_kmh=float(action_speeds_kmh[edge_idx]), ) edge_rows.append( { "model": model_name, "model_label": MODEL_LABELS[model_name], "step": step_idx, "edge_index": edge_idx, "edge_id": edge_id, "action_index": action_index, "action_speed_kmh": logged_speed_kmh, "action_applied": bool(action_applied_mask[edge_idx]), "measured_speed_kmh": float(measured_speeds_ms[edge_idx] * 3.6), "occupancy": float(occupancies[edge_idx]), } ) for detector_cell in info.get("detector_cells", []): edge_idx = detector_cell["edge_index"] action_index, logged_speed_kmh = resolve_logged_action_info( model_name=model_name, env=env, action=action, action_applied_mask=action_applied_mask, edge_idx=edge_idx, action_speed_kmh=float(action_speeds_kmh[edge_idx]), ) detector_rows.append( { "model": model_name, "model_label": MODEL_LABELS[model_name], "step": step_idx, "edge_index": edge_idx, "edge_id": detector_cell["edge_id"], "pos_index": detector_cell["pos_index"], "position_m": detector_cell["position_m"], "distance_m": detector_cell["distance_m"], "cell_id": f"{detector_cell['edge_id']}@{detector_cell['pos_index']}", "action_index": action_index, "action_speed_kmh": logged_speed_kmh, "action_applied": bool(action_applied_mask[edge_idx]), "measured_speed_kmh": float(detector_cell["speed_ms"] * 3.6), "occupancy": float(detector_cell["occupancy"]), "vehicle_count": int(detector_cell["vehicle_count"]), } ) state = next_state if done: break env.close() step_df = pd.DataFrame(step_rows) edge_df = pd.DataFrame(edge_rows) detector_df = pd.DataFrame(detector_rows) meta = { "model": model_name, "checkpoint_dir": os.path.abspath(checkpoint_dir) if checkpoint_dir else "", "checkpoint_path": ( os.path.abspath(load_path if load_path.endswith(".pt") else f"{load_path}.zip") if load_path else "" ), "num_steps": len(step_df), "num_edges": env.num_edges, "begin_time": effective_begin_time, "end_time": effective_end_time, "with_gui": with_gui, "step_length": runtime_config["sumo"].get("step_length"), "route_file": runtime_config["sumo"].get("route_file", ""), "mainline_completed_total": len(completed_mainline_travel_times), "mainline_travel_time_mean_s": ( float(np.mean(completed_mainline_travel_times)) if completed_mainline_travel_times else np.nan ), } return step_df, edge_df, detector_df, meta def evaluate_worker(task: Tuple[str, Optional[str], str, str, int, Optional[int], Optional[int], bool, Optional[float], Optional[str]]): return evaluate_single_model(*task) def build_summary(step_df: pd.DataFrame) -> pd.DataFrame: grouped = step_df.groupby(["model", "model_label"], sort=False) aggregations = dict( steps=("step", "count"), reward_sum=("reward", "sum"), reward_mean=("reward", "mean"), throughput_mean=("throughput", "mean"), throughput_std=("throughput", "std"), mean_speed_kmh_mean=("mean_speed_kmh", "mean"), mean_speed_kmh_std=("mean_speed_kmh", "std"), speed_variance_norm_mean=("speed_variance_norm", "mean"), density_mean=("density", "mean"), stops_total=("num_stops", "sum"), stops_mean=("num_stops", "mean"), mainline_completed_total=("mainline_completed_count", "sum"), mainline_travel_time_mean_s=("mainline_travel_time_cumulative_mean_s", "last"), ) for column in REWARD_COMPONENT_COLUMNS: aggregations[f"{column}_mean"] = (column, "mean") summary_df = grouped.agg(**aggregations).reset_index() summary_df["throughput_std"] = summary_df["throughput_std"].fillna(0.0) summary_df["mean_speed_kmh_std"] = summary_df["mean_speed_kmh_std"].fillna(0.0) return summary_df.sort_values("model", key=lambda s: s.map({m: i for i, m in enumerate(EVAL_ORDER)})) def save_csv_outputs( step_df: pd.DataFrame, edge_df: pd.DataFrame, detector_df: pd.DataFrame, summary_df: pd.DataFrame, meta_rows: List[dict], output_dir: str, ): os.makedirs(output_dir, exist_ok=True) step_df.to_csv(os.path.join(output_dir, "step_metrics.csv"), index=False) edge_df.to_csv(os.path.join(output_dir, "edge_metrics.csv"), index=False) detector_df.to_csv(os.path.join(output_dir, "detector_metrics.csv"), index=False) summary_df.to_csv(os.path.join(output_dir, "summary.csv"), index=False) pd.DataFrame(meta_rows).to_csv(os.path.join(output_dir, "evaluation_manifest.csv"), index=False) per_model_dir = os.path.join(output_dir, "per_model") os.makedirs(per_model_dir, exist_ok=True) for model_name in step_df["model"].unique(): model_dir = os.path.join(per_model_dir, model_name) os.makedirs(model_dir, exist_ok=True) step_df[step_df["model"] == model_name].to_csv(os.path.join(model_dir, "step_metrics.csv"), index=False) edge_df[edge_df["model"] == model_name].to_csv(os.path.join(model_dir, "edge_metrics.csv"), index=False) detector_df[detector_df["model"] == model_name].to_csv(os.path.join(model_dir, "detector_metrics.csv"), index=False) def plot_step_comparison(step_df: pd.DataFrame, output_dir: str): metrics = [ ("reward", "Reward"), ("throughput", "Throughput (veh/h)"), ("mean_speed_kmh", "Mean Speed (km/h)"), ("speed_variance_norm", "Normalized Speed Variance"), ("num_stops", "Stops"), ("density", "Density (veh/km)"), ("mainline_travel_time_cumulative_mean_s", "Avg Mainline Travel Time (s)"), ] fig, axes = plt.subplots(4, 2, figsize=(16, 15), sharex=True) axes = axes.flatten() for ax, (column, title) in zip(axes, metrics): for model_name in EVAL_ORDER: model_df = step_df[step_df["model"] == model_name] if model_df.empty: continue ax.plot(model_df["step"], model_df[column], label=MODEL_LABELS[model_name], linewidth=1.8) ax.set_title(title) ax.set_xlabel("Step") ax.grid(True, alpha=0.3) for ax in axes[len(metrics):]: ax.axis("off") axes[0].legend() plt.tight_layout() plt.savefig(os.path.join(output_dir, "comparison_timeseries.png"), dpi=160) plt.close() def plot_reward_components(step_df: pd.DataFrame, output_dir: str): components = [ (column, REWARD_COMPONENT_LABELS[column]) for column in REWARD_COMPONENT_COLUMNS ] fig, axes = plt.subplots(4, 2, figsize=(15, 14), sharex=True) axes = axes.flatten() for ax, (column, title) in zip(axes, components): for model_name in EVAL_ORDER: model_df = step_df[step_df["model"] == model_name] if model_df.empty: continue ax.plot(model_df["step"], model_df[column], label=MODEL_LABELS[model_name], linewidth=1.8) ax.set_title(title) ax.set_xlabel("Step") ax.grid(True, alpha=0.3) for ax in axes[len(components):]: ax.axis("off") axes[0].legend() plt.tight_layout() plt.savefig(os.path.join(output_dir, "reward_components.png"), dpi=160) plt.close() def plot_summary_bars(summary_df: pd.DataFrame, output_dir: str): metrics = [ ("throughput_mean", "Avg Throughput (veh/h)"), ("mean_speed_kmh_mean", "Avg Mean Speed (km/h)"), ("speed_variance_norm_mean", "Avg Normalized Speed Variance"), ("stops_total", "Total Stops"), ("mainline_travel_time_mean_s", "Avg Mainline Travel Time (s)"), ] fig, axes = plt.subplots(3, 2, figsize=(14, 12)) axes = axes.flatten() labels = summary_df["model_label"].tolist() x = np.arange(len(labels)) for ax, (column, title) in zip(axes, metrics): ax.bar(x, summary_df[column].values) ax.set_title(title) ax.set_xticks(x) ax.set_xticklabels(labels) ax.grid(True, axis="y", alpha=0.3) for ax in axes[len(metrics):]: ax.axis("off") plt.tight_layout() plt.savefig(os.path.join(output_dir, "summary_bars.png"), dpi=160) plt.close() def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output_dir: str): heatmap_dir = os.path.join(output_dir, "heatmaps") os.makedirs(heatmap_dir, exist_ok=True) for model_name in EVAL_ORDER: detector_model_df = detector_df[detector_df["model"] == model_name] edge_model_df = edge_df[edge_df["model"] == model_name] if detector_model_df.empty or edge_model_df.empty: continue cell_order = ( detector_model_df[["edge_index", "edge_id", "pos_index", "cell_id", "distance_m"]] .drop_duplicates() .sort_values(["distance_m", "edge_index", "pos_index"]) ) ordered_cell_ids = cell_order["cell_id"].tolist() speed_grid = detector_model_df.pivot(index="cell_id", columns="step", values="measured_speed_kmh").reindex(ordered_cell_ids).values occ_grid = detector_model_df.pivot(index="cell_id", columns="step", values="occupancy").reindex(ordered_cell_ids).values edge_order = ( edge_model_df[["edge_index", "edge_id"]] .drop_duplicates() .sort_values("edge_index") ) ordered_edge_ids = edge_order["edge_id"].tolist() action_plot_df = edge_model_df.copy() if "action_applied" in action_plot_df.columns: action_plot_df.loc[~action_plot_df["action_applied"].astype(bool), "action_speed_kmh"] = np.nan action_grid = ( action_plot_df.pivot(index="edge_id", columns="step", values="action_speed_kmh") .reindex(ordered_edge_ids) .values ) panels = [ build_speed_panel( speed_grid, ordered_cell_ids, f"{MODEL_LABELS[model_name]} Measured Speed (km/h)", "Detector Cell (bottom=upstream, top=downstream)", ), build_action_panel( action_grid, ordered_edge_ids, f"{MODEL_LABELS[model_name]} Applied VSL (km/h)", ), build_occupancy_panel( occ_grid, ordered_cell_ids, f"{MODEL_LABELS[model_name]} Occupancy (%)", "Detector Cell (bottom=upstream, top=downstream)", ), ] save_heatmap_panels( os.path.join(heatmap_dir, f"{model_name}_heatmaps.png"), panels, xlabel="Decision Step", ) def _format_metric(value: float, fmt: str) -> str: return "N/A" if pd.isna(value) else format(value, fmt) def print_summary(summary_df: pd.DataFrame, output_dir: str): print("\n" + "=" * 72) print("Evaluation Summary") print("=" * 72) for _, row in summary_df.iterrows(): print(f"\n{row['model_label']}:") print(f" Avg Throughput: {row['throughput_mean']:.1f} veh/h") print(f" Avg Mean Speed: {row['mean_speed_kmh_mean']:.1f} km/h") print(f" Avg Normalized Speed Variance: {row['speed_variance_norm_mean']:.4f}") print(f" Total Stops: {row['stops_total']:.0f}") print(f" Mainline Completed Vehicles: {row['mainline_completed_total']:.0f}") print( " Avg Mainline Travel Time: " f"{_format_metric(row['mainline_travel_time_mean_s'], '.2f')} s" ) print(f" Avg Density: {row['density_mean']:.2f} veh/km") print(f" Episode Reward Sum: {row['reward_sum']:.2f}") print("=" * 72) print(f"Saved to: {os.path.abspath(output_dir)}") def main(): args = parse_args() route_file = resolve_project_path(args.route_file) if route_file is not None and not os.path.isfile(route_file): raise FileNotFoundError(f"Custom route file not found: {route_file}") checkpoint_root = resolve_checkpoint_root(args.checkpoint_root) model_dirs = discover_model_dirs(checkpoint_root, args.models) if not model_dirs: raise FileNotFoundError("No models matched the requested selection.") output_dir = resolve_eval_output_dir(args.output_dir, checkpoint_root) os.makedirs(output_dir, exist_ok=True) all_step_dfs = [] all_edge_dfs = [] all_detector_dfs = [] meta_rows = [] tasks = [] first_checkpoint_dir = None for model_name in MODEL_ORDER: checkpoint_dir = model_dirs.get(model_name) if checkpoint_dir is None: continue if first_checkpoint_dir is None: first_checkpoint_dir = checkpoint_dir tasks.append( ( model_name, checkpoint_dir, args.config, output_dir, args.seed, args.begin_time, args.end_time, args.with_gui, args.step_length, route_file, ) ) if not tasks: raise FileNotFoundError("No evaluation tasks were created.") tasks.insert( 0, ( BASELINE_NAME, first_checkpoint_dir, args.config, output_dir, args.seed, args.begin_time, args.end_time, args.with_gui, args.step_length, route_file, ), ) default_workers = min(len(tasks), max(1, (os.cpu_count() or 2) // 2)) max_workers = args.workers or default_workers if args.with_gui and max_workers > 1: print("GUI evaluation requested, forcing workers=1 to avoid multiple SUMO GUI windows.") max_workers = 1 if max_workers <= 1 or len(tasks) == 1: for task in tasks: model_name, checkpoint_dir, *_ = task print(f"Evaluating {MODEL_LABELS[model_name]} from {checkpoint_dir}") step_df, edge_df, detector_df, meta = evaluate_worker(task) all_step_dfs.append(step_df) all_edge_dfs.append(edge_df) all_detector_dfs.append(detector_df) meta_rows.append(meta) else: print(f"Running evaluation in parallel with {max_workers} workers") future_to_model = {} with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: for task in tasks: model_name, checkpoint_dir, *_ = task print(f"Queueing {MODEL_LABELS[model_name]} from {checkpoint_dir}") future = executor.submit(evaluate_worker, task) future_to_model[future] = model_name for future in concurrent.futures.as_completed(future_to_model): model_name = future_to_model[future] step_df, edge_df, detector_df, meta = future.result() print(f"Finished {MODEL_LABELS[model_name]}") all_step_dfs.append(step_df) all_edge_dfs.append(edge_df) all_detector_dfs.append(detector_df) meta_rows.append(meta) step_df = pd.concat(all_step_dfs, ignore_index=True) edge_df = pd.concat(all_edge_dfs, ignore_index=True) detector_df = pd.concat(all_detector_dfs, ignore_index=True) summary_df = build_summary(step_df) save_csv_outputs(step_df, edge_df, detector_df, summary_df, meta_rows, output_dir) plot_step_comparison(step_df, output_dir) plot_reward_components(step_df, output_dir) plot_summary_bars(summary_df, output_dir) plot_model_heatmaps(edge_df, detector_df, output_dir) print_summary(summary_df, output_dir) if __name__ == "__main__": main()