ctm-dqn/scripts/evaluate_models.py

1111 lines
45 KiB
Python

"""Evaluate trained models and export structured CSV/plots."""
import argparse
import concurrent.futures
import copy
import glob
import inspect
import os
import sys
from typing import Dict, List, Optional, Tuple
# Repository root = parent of the directory holding this script. It is put at
# the front of sys.path ahead of the project-local imports further down
# (presumably so they resolve when the file is run directly).
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
import matplotlib
import numpy as np
import pandas as pd
import yaml
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from agents.appo_agent import APPOAgent
from agents.dcmappo_agent import DCMAPPOAgent
from agents.ddqn_agent import DDQNAgent
from agents.ddpg_agent import DDPGAgent
from agents.dqn_agent import DQNAgent
from agents.gpro_agent import GPROAgent
from agents.madqn_agent import MADQNAgent
from agents.mappo_agent import MAPPOAgent
from agents.ppo_agent import PPOAgent
from agents.qmix_agent import QMIXAgent
from agents.dcqmix_agent import DCQMIXAgent
from agents.sac_agent import SACAgent
from agents.sctd3_agent import SCTD3Agent
from agents.tcamappo_agent import TCAMAPPOAgent
from agents.td3_agent import TD3Agent
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS
from utils.config import get_agent_config
from utils.heatmap_plotting import (
build_action_panel,
build_occupancy_panel,
build_speed_panel,
save_heatmap_panels,
)
from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root
# Key used for the reference run that is evaluated without a trained agent.
BASELINE_NAME = "no_control"

# Canonical model keys, in the order used for discovery and for sorting results.
MODEL_ORDER = [
    "ppo",
    "gpro",
    "appo",
    "mappo",
    "tcamappo",
    "dcmappo",
    "dqn",
    "madqn",
    "ddqn",
    "qmix",
    "dcqmix",
    "ddpg",
    "sac",
    "td3",
    "sctd3",
]

# Evaluation order: the baseline first, then every trainable model.
EVAL_ORDER = [BASELINE_NAME, *MODEL_ORDER]

# Human-readable label for each evaluation key (used in CSV columns and plots).
MODEL_LABELS = {
    BASELINE_NAME: "NO_CONTROL",
    "ppo": "PPO",
    "gpro": "GPRO-PPO",
    "appo": "APPO",
    "mappo": "MAPPO",
    "tcamappo": "TCA-MAPPO",
    "dcmappo": "DC-MAPPO",
    "dqn": "DQN",
    "madqn": "MA-DQN",
    "ddqn": "DDQN",
    "qmix": "QMIX",
    "dcqmix": "DC-QMIX",
    "ddpg": "DDPG",
    "sac": "SAC",
    "td3": "TD3",
    "sctd3": "SC-TD3",
}
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behavior callers had before
            this parameter existed. Passing a list lets other code drive the
            parser without touching ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``. Every option except ``--config``
        (``"config_sumo_vsl.yaml"``), ``--seed`` (``42``) and ``--with-gui``
        (``False``) defaults to ``None``.
    """
    parser = argparse.ArgumentParser(description="Evaluate trained SUMO VSL models.")
    parser.add_argument(
        "--checkpoint-root",
        type=str,
        default=None,
        help="Checkpoint root or run root. Default: latest under runs/<timestamp>.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Evaluation output directory. Default: results/evaluations/<run_name>.",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="config_sumo_vsl.yaml",
        help="Fallback config path when the shared run config is unavailable.",
    )
    parser.add_argument(
        "--models",
        nargs="*",
        default=None,
        help="Subset of models to evaluate, e.g. --models ppo gpro tcamappo dcmappo dqn madqn ddqn qmix dcqmix sac td3 sctd3",
    )
    parser.add_argument("--seed", type=int, default=42, help="Evaluation seed.")
    parser.add_argument(
        "--workers",
        type=int,
        default=None,
        help="Number of parallel evaluation workers. Default: min(model_count, max(1, cpu_count//2)).",
    )
    parser.add_argument(
        "--with-gui",
        action="store_true",
        help="Run evaluation with SUMO GUI enabled.",
    )
    parser.add_argument(
        "--begin-time",
        type=int,
        default=None,
        help="Override SUMO evaluation begin time in seconds. Default: use training config.",
    )
    parser.add_argument(
        "--end-time",
        type=int,
        default=None,
        help="Override SUMO evaluation end time in seconds. Default: use training config.",
    )
    parser.add_argument(
        "--step-length",
        type=float,
        default=None,
        help="Override SUMO simulation step length for evaluation only. Default: use training config.",
    )
    parser.add_argument(
        "--route-file",
        type=str,
        default=None,
        help="Override SUMO route/flow file for evaluation only. Supports absolute paths or project-relative paths.",
    )
    return parser.parse_args(argv)
def normalize_model_name(name: str) -> str:
    """Return the canonical key for a user-supplied model name.

    The name is stripped of surrounding whitespace and lower-cased, then
    checked against ``MODEL_ORDER``.

    Raises:
        ValueError: If the normalized name is not a known model key.
    """
    canonical = name.strip().lower()
    if canonical in MODEL_ORDER:
        return canonical
    raise ValueError(f"Unsupported model name: {name}")
def discover_model_dirs(checkpoint_root: str, requested_models: List[str] = None) -> Dict[str, str]:
    """Map model keys to the checkpoint directories found under a root.

    Three layouts are tried in order:

    1. ``checkpoint_root`` contains sub-directories named after model keys.
    2. ``checkpoint_root`` itself is named after a model key and sits below a
       ``checkpoints`` or ``multi-model`` directory.
    3. ``checkpoint_root`` contains a ``config.yaml``; the model key is taken
       from the parent name, the directory name, or a single requested model.

    Args:
        checkpoint_root: Directory to inspect (made absolute first).
        requested_models: Optional model names used to filter the result.

    Returns:
        ``{model_key: directory}``; may be empty when the filter excludes
        everything that was found.

    Raises:
        ValueError: For an unknown requested name, or when layout 3 matches
            but the model key cannot be inferred.
        FileNotFoundError: When no layout matches.
    """
    root = os.path.abspath(checkpoint_root)
    wanted = None
    if requested_models:
        wanted = [normalize_model_name(entry) for entry in requested_models]

    # Layout 1: one sub-directory per model key.
    candidates = ((name, os.path.join(root, name)) for name in MODEL_ORDER)
    found = {name: path for name, path in candidates if os.path.isdir(path)}
    if found:
        if wanted is None:
            return found
        return {name: path for name, path in found.items() if name in wanted}

    parent_dir = os.path.dirname(root)
    leaf = os.path.basename(root).lower()
    parent = os.path.basename(parent_dir).lower()
    grandparent = os.path.basename(os.path.dirname(parent_dir)).lower()

    # Layout 2: the root is itself a model directory in a known container.
    in_known_container = parent == "checkpoints" or grandparent in {"checkpoints", "multi-model"}
    if leaf in MODEL_LABELS and in_known_container:
        if wanted is not None and leaf not in wanted:
            return {}
        return {leaf: root}

    # Layout 3: a lone checkpoint directory, recognised by its config.yaml.
    if not os.path.isfile(os.path.join(root, "config.yaml")):
        raise FileNotFoundError(f"No model checkpoint directories found in: {root}")

    if parent in MODEL_LABELS:
        inferred = parent
    elif leaf in MODEL_LABELS:
        inferred = leaf
    elif wanted and len(wanted) == 1:
        inferred = wanted[0]
    else:
        raise ValueError(
            "Single checkpoint dir detected, but model type cannot be inferred. "
            "Please pass --models <model_name>."
        )
    return {inferred: root}
def infer_eval_run_name(checkpoint_root: str) -> str:
    """Derive a run name for the evaluation output from a checkpoint path.

    The name is built from the last three components of the absolute path,
    depending on where ``checkpoints`` / ``multi-model`` / a model key appears.
    When no special layout is recognised, the directory name is returned.
    """
    root = os.path.abspath(checkpoint_root)
    parent_dir, leaf = os.path.split(root)
    grandparent_dir, parent = os.path.split(parent_dir)
    grandparent = os.path.basename(grandparent_dir)

    # <run>/checkpoints -> <run>
    if leaf == "checkpoints":
        return parent
    # <run>/checkpoints/<model> -> <model>_<run>
    if parent == "checkpoints":
        return f"{leaf}_{grandparent}"
    # multi-model/<run>/<model> -> <model>_<run>
    if grandparent == "multi-model":
        return f"{leaf}_{parent}"
    # multi-model/<run> -> <run>
    if parent == "multi-model":
        return leaf
    # <model>/<run> -> <model>_<run>
    if parent in MODEL_ORDER:
        return f"{parent}_{leaf}"
    return leaf
def resolve_eval_output_dir(output_dir: str, checkpoint_root: str) -> str:
    """Return the evaluation output directory.

    An explicit (non-empty) ``output_dir`` is returned as is. Otherwise the
    result is ``results/evaluations/<run_name>``, with the run name inferred
    from ``checkpoint_root``.
    """
    if not output_dir:
        return os.path.join("results", "evaluations", infer_eval_run_name(checkpoint_root))
    return output_dir
def load_config_for_checkpoint(checkpoint_dir: Optional[str], fallback_config_path: str) -> dict:
    """Load the fallback config and overlay the checkpoint's shared config.

    Args:
        checkpoint_dir: Checkpoint directory handed to
            ``find_shared_config_path`` to locate the run's config.
        fallback_config_path: YAML file that is always read and used as the
            base of the merge.

    Returns:
        The fallback config, deep-merged with the checkpoint config when that
        file exists (checkpoint values win). An empty YAML document is
        treated as an empty mapping.

    Raises:
        OSError: If ``fallback_config_path`` cannot be opened.
    """
    # yaml.safe_load returns None for an empty document; normalise to {} so
    # the merge below and the caller's dict operations do not fail on None.
    with open(fallback_config_path, "r", encoding="utf-8") as f:
        base_config = yaml.safe_load(f) or {}

    checkpoint_config = find_shared_config_path(checkpoint_dir, fallback_config_path)
    if checkpoint_config and os.path.isfile(checkpoint_config):
        with open(checkpoint_config, "r", encoding="utf-8") as f:
            checkpoint_loaded = yaml.safe_load(f) or {}
        return deep_merge_dicts(base_config, checkpoint_loaded)
    return base_config
def deep_merge_dicts(base: dict, override: dict) -> dict:
    """Recursively merge ``override`` into a deep copy of ``base``.

    Where both sides hold a dict under the same key the dicts are merged
    recursively; in every other case the override value replaces the base
    value. Neither input is mutated and the result shares no mutable objects
    with them.

    Args:
        base: Mapping providing the defaults. ``None`` is treated as empty.
        override: Mapping whose values take precedence. ``None`` or an empty
            mapping yields a plain deep copy of ``base``.

    Returns:
        A new merged dict.
    """
    merged = {} if base is None else copy.deepcopy(base)
    # A None override (e.g. an empty YAML document) means "nothing to change".
    if not override:
        return merged
    for key, value in override.items():
        current = merged.get(key)
        if isinstance(value, dict) and isinstance(current, dict):
            merged[key] = deep_merge_dicts(current, value)
        else:
            merged[key] = copy.deepcopy(value)
    return merged
def resolve_project_path(path_str: Optional[str]) -> Optional[str]:
    """Resolve a possibly project-relative path.

    Returns ``None`` for an empty or missing path, the path unchanged when it
    is already absolute, and otherwise the absolute path obtained by joining
    it onto ``PROJECT_ROOT``.
    """
    if path_str and not os.path.isabs(path_str):
        return os.path.abspath(os.path.join(PROJECT_ROOT, path_str))
    return path_str or None
def _checkpoint_episode_key(path: str) -> Tuple[int, str]:
    """Sort key ordering ``model_ep<N>.<ext>`` files by the numeric episode N."""
    stem = os.path.splitext(os.path.basename(path))[0]
    digits = ""
    for char in stem[len("model_ep"):]:
        if char not in "0123456789":
            break
        digits += char
    # Names without a leading number sort first; the path breaks ties.
    return (int(digits) if digits else -1, path)


def resolve_model_load_path(model_name: str, checkpoint_dir: str) -> str:
    """Pick the checkpoint to load for a model.

    ``model_best`` is preferred; otherwise the ``model_ep*`` file with the
    highest episode number is used. Episode files are compared numerically,
    because a plain string sort would rank ``model_ep900`` after
    ``model_ep1000`` and silently select an older checkpoint.

    Args:
        model_name: Canonical model key. It decides whether ``.pt`` or
            ``.zip`` checkpoints are looked up.
        checkpoint_dir: Directory containing the checkpoint files.

    Returns:
        For ``.pt`` models the full file path. For ``.zip`` models the path
        with the ``.zip`` suffix removed.

    Raises:
        FileNotFoundError: If no matching checkpoint file exists.
    """
    pt_models = {"ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix"}
    extension = ".pt" if model_name in pt_models else ".zip"

    best_path = os.path.join(checkpoint_dir, f"model_best{extension}")
    if os.path.isfile(best_path):
        chosen = best_path
    else:
        episode_paths = glob.glob(os.path.join(checkpoint_dir, f"model_ep*{extension}"))
        if not episode_paths:
            raise FileNotFoundError(f"No checkpoint file found for {model_name} under: {checkpoint_dir}")
        chosen = max(episode_paths, key=_checkpoint_episode_key)

    if extension == ".zip":
        # The .zip models are addressed by their path without the suffix.
        return chosen[: -len(".zip")]
    return chosen
def build_value_based_agent(agent_cls, agent_cfg: dict, env: SUMOEdgeVSLEnvironment):
    """Instantiate a value-based agent with the kwargs its constructor accepts.

    A superset of possible keyword arguments is assembled from the environment
    and the agent config; only those named in ``agent_cls``'s signature are
    passed on, so one builder can serve agent classes with different
    constructors.

    Args:
        agent_cls: Agent class to instantiate.
        agent_cfg: Agent hyper-parameters; missing keys fall back to the
            defaults written below.
        env: Environment supplying the state/action/edge dimensions.

    Returns:
        The constructed agent instance.
    """
    cfg = agent_cfg.get
    hidden_dim = cfg("hidden_dim", 256)

    # Dimensions read from the environment.
    from_env = dict(
        state_dim=env.state_dim,
        num_edges=env.num_controlled_edges,
        num_actions_per_edge=env.action_dim,
        edge_feature_dim=env.features_per_edge,
        time_feature_dim=3,
        total_edge_count=env.num_edges,
        controlled_start_index=env.controlled_edge_start_index,
    )
    # Hyper-parameters read from the agent config.
    from_config = dict(
        hidden_dim=hidden_dim,
        # The mixing network width defaults to the main hidden width.
        mixing_hidden_dim=cfg("mixing_hidden_dim", hidden_dim),
        learning_rate=cfg("learning_rate", 3e-4),
        gamma=cfg("gamma", 0.99),
        epsilon_start=cfg("epsilon_start", 1.0),
        epsilon_end=cfg("epsilon_end", 0.01),
        epsilon_decay=cfg("epsilon_decay", 600),
        buffer_size=cfg("buffer_size", 20000),
        batch_size=cfg("batch_size", 128),
        target_update=cfg("target_update", 5),
        device=cfg("device", "cuda"),
        num_corridor_blocks=cfg("num_corridor_blocks", 2),
        corridor_kernel_size=cfg("corridor_kernel_size", 5),
        corridor_dropout=cfg("corridor_dropout", 0.05),
    )

    supported = inspect.signature(agent_cls).parameters
    kwargs = {}
    for source in (from_env, from_config):
        for name, value in source.items():
            if name in supported:
                kwargs[name] = value
    return agent_cls(**kwargs)
def build_agent(model_name: str, config: dict, env: SUMOEdgeVSLEnvironment):
    """Construct the (untrained) agent object for ``model_name``.

    Hyper-parameters come from ``get_agent_config(config, model_name)`` with
    the defaults written below; dimensions come from ``env``. Keyword
    arguments shared by a family of agents are collected once and reused.

    Args:
        model_name: Canonical model key.
        config: Full run configuration.
        env: Environment supplying the state/action/edge dimensions.

    Returns:
        A freshly constructed agent of the matching class.

    Raises:
        ValueError: If ``model_name`` is not a supported key.
    """
    agent_cfg = get_agent_config(config, model_name)
    cfg = agent_cfg.get

    # ---- value-based agents: delegated to the signature-filtering builder ----
    value_based_classes = {
        "dqn": DQNAgent,
        "madqn": MADQNAgent,
        "ddqn": DDQNAgent,
        "qmix": QMIXAgent,
        "dcqmix": DCQMIXAgent,
    }
    if model_name in value_based_classes:
        return build_value_based_agent(value_based_classes[model_name], agent_cfg, env)

    # One action dimension per controlled edge.
    flat_action_dims = [env.action_dim] * env.num_controlled_edges
    # Description of the per-edge feature layout, used by structured agents.
    edge_layout = dict(
        edge_feature_dim=env.features_per_edge,
        total_edge_count=env.num_edges,
        controlled_start_index=env.controlled_edge_start_index,
    )

    # ---- on-policy (PPO-style) agents ----
    if model_name in {"ppo", "gpro", "appo", "mappo", "dcmappo", "tcamappo"}:
        # Settings common to every agent in this family, GPRO included.
        clipped_update = dict(
            learning_rate=cfg("learning_rate", 3e-4),
            clip_epsilon=cfg("clip_epsilon", 0.2),
            entropy_coef=cfg("entropy_coef", 0.01),
            max_grad_norm=cfg("max_grad_norm", 0.5),
            ppo_epochs=cfg("ppo_epochs", 4),
            minibatch_size=cfg("batch_size", 15),
            device=cfg("device", "cuda"),
            lr_schedule=cfg("lr_schedule", "cosine"),
            total_episodes=config.get("training", {}).get("num_episodes", 4000),
        )
        if model_name == "gpro":
            # GPRO takes no gamma / gae_lambda / value_coef.
            return GPROAgent(
                state_dim=env.state_dim,
                action_dims=flat_action_dims,
                hidden_layers=cfg("hidden_layers", [256, 256]),
                group_size=cfg("group_size", 4),
                advantage_epsilon=cfg("advantage_epsilon", 1e-8),
                **clipped_update,
            )

        # The remaining agents additionally take the value/GAE settings.
        actor_critic = dict(
            gamma=cfg("gamma", 0.99),
            gae_lambda=cfg("gae_lambda", 0.95),
            value_coef=cfg("value_coef", 0.5),
            **clipped_update,
        )
        if model_name == "ppo":
            return PPOAgent(
                state_dim=env.state_dim,
                action_dims=flat_action_dims,
                hidden_layers=cfg("hidden_layers", [256, 256]),
                **actor_critic,
            )
        if model_name == "appo":
            return APPOAgent(
                state_dim=env.state_dim,
                action_dims=flat_action_dims,
                hidden_dim=cfg("hidden_dim", 128),
                num_heads=cfg("num_heads", 4),
                num_layers=cfg("num_layers", 2),
                **edge_layout,
                **actor_critic,
            )

        # MAPPO and its variants share this per-edge multi-agent signature.
        multi_agent = dict(
            state_dim=env.state_dim,
            num_agents=env.num_controlled_edges,
            num_actions=env.action_dim,
            hidden_dim=cfg("hidden_dim", 256),
            critic_hidden_dim=cfg("critic_hidden_dim", 256),
            **edge_layout,
            **actor_critic,
        )
        if model_name == "mappo":
            return MAPPOAgent(**multi_agent)
        if model_name == "dcmappo":
            return DCMAPPOAgent(
                num_corridor_blocks=cfg("num_corridor_blocks", 2),
                corridor_kernel_size=cfg("corridor_kernel_size", 3),
                corridor_dropout=cfg("corridor_dropout", 0.05),
                **multi_agent,
            )
        # Only "tcamappo" is left in this family.
        return TCAMAPPOAgent(
            history_window=cfg("history_window", 6),
            critic_num_heads=cfg("critic_num_heads", 4),
            critic_num_layers=cfg("critic_num_layers", 2),
            critic_dropout=cfg("critic_dropout", 0.05),
            **multi_agent,
        )

    # ---- replay-buffer agents ----
    if model_name in {"ddpg", "sac", "td3", "sctd3"}:
        replay_based = dict(
            state_dim=env.state_dim,
            action_dims=flat_action_dims,
            learning_rate=cfg("learning_rate", 3e-4),
            buffer_size=cfg("buffer_size", 20000),
            learning_starts=cfg("learning_starts", 200),
            batch_size=cfg("batch_size", 128),
            tau=cfg("tau", 0.005),
            gamma=cfg("gamma", 0.99),
            device=cfg("device", "cuda"),
            actor_hidden_dims=cfg("actor_hidden_dims"),
            critic_hidden_dims=cfg("critic_hidden_dims"),
            activation_fn=cfg("activation_fn", "relu"),
        )
        if model_name == "ddpg":
            return DDPGAgent(
                exploration_sigma=cfg("exploration_sigma", 0.15),
                **replay_based,
            )
        if model_name == "sac":
            return SACAgent(
                ent_coef=cfg("ent_coef", "auto"),
                target_entropy=cfg("target_entropy", "auto"),
                target_update_interval=cfg("target_update_interval", 1),
                log_std_init=cfg("log_std_init", -3.0),
                **replay_based,
            )

        td3_kwargs = dict(
            policy_delay=cfg("policy_delay", 2),
            exploration_sigma=cfg("exploration_sigma", 0.15),
            **replay_based,
        )
        if model_name == "td3":
            return TD3Agent(**td3_kwargs)
        # SC-TD3 adds the edge layout and feature-extractor settings.
        return SCTD3Agent(
            extractor_feature_dim=cfg("extractor_feature_dim", 128),
            extractor_edge_hidden_dim=cfg("extractor_edge_hidden_dim", 16),
            extractor_global_hidden_dim=cfg("extractor_global_hidden_dim", 32),
            extractor_spatial_blocks=cfg("extractor_spatial_blocks", 1),
            extractor_kernel_size=cfg("extractor_kernel_size", 3),
            **edge_layout,
            **td3_kwargs,
        )

    raise ValueError(f"Unsupported model name: {model_name}")
def select_deterministic_action(agent, state: np.ndarray) -> np.ndarray:
    """Ask the agent for its deterministic action in ``state``.

    ``agent.select_action`` is expected to return a 3-tuple; only the first
    element (the action) is kept.
    """
    chosen_action, _second, _third = agent.select_action(state, deterministic=True)
    return chosen_action
def select_no_control_action(env: SUMOEdgeVSLEnvironment) -> np.ndarray:
    """Return the baseline action: the last action index on every controlled edge.

    The result is an ``int64`` vector of length ``env.num_controlled_edges``
    filled with ``env.action_dim - 1`` (presumably the highest speed limit —
    confirm against the environment's action table). With no controlled edges
    an empty ``int64`` vector is returned.
    """
    edge_count = env.num_controlled_edges
    if edge_count > 0:
        return np.full(edge_count, env.action_dim - 1, dtype=np.int64)
    return np.zeros(0, dtype=np.int64)
def resolve_logged_action_info(
    model_name: str,
    env: SUMOEdgeVSLEnvironment,
    action: Optional[np.ndarray],
    action_applied_mask: List[bool],
    edge_idx: int,
    action_speed_kmh: float,
) -> Tuple[int, float]:
    """Return the ``(action_index, action_speed_kmh)`` pair to log for an edge.

    The action index is ``-1`` when no action was applied on the edge, when
    there is no action vector, or when the edge lies outside the controlled
    range; otherwise it is the action entry for that edge. The speed is always
    passed through unchanged. ``model_name`` is accepted but not used.
    """
    no_action = (-1, action_speed_kmh)
    if not action_applied_mask[edge_idx]:
        return no_action

    # Position of this edge inside the action vector of the controlled edges.
    offset = edge_idx - env.controlled_edge_start_index
    if action is not None and 0 <= offset < len(action):
        return int(action[offset]), action_speed_kmh
    return no_action
def update_mainline_travel_time_tracking(
    info: dict,
    mainline_depart_times: Dict[str, float],
    active_mainline_vehicle_ids: set,
    completed_mainline_travel_times: List[float],
) -> Tuple[int, float, float]:
    """Update mainline travel-time bookkeeping with one step's trip events.

    Mainline departures in ``info["departed_vehicle_events"]`` are recorded;
    arrivals in ``info["arrived_vehicle_events"]`` of vehicles being tracked
    are turned into travel times. The three tracking containers are mutated
    in place.

    Returns:
        ``(completed_this_step, mean_this_step, mean_so_far)``. Each mean is
        NaN when it has no travel times to average.
    """
    # Register newly departed mainline vehicles.
    for departure in info.get("departed_vehicle_events", []):
        if departure.get("is_mainline", False):
            vehicle = str(departure["vehicle_id"])
            mainline_depart_times[vehicle] = float(departure["sim_time"])
            active_mainline_vehicle_ids.add(vehicle)

    # Close the trips of tracked vehicles that arrived during this step.
    finished_now = []
    for arrival in info.get("arrived_vehicle_events", []):
        vehicle = str(arrival["vehicle_id"])
        if vehicle in active_mainline_vehicle_ids:
            departed_at = mainline_depart_times.pop(vehicle, None)
            active_mainline_vehicle_ids.discard(vehicle)
            if departed_at is not None:
                elapsed = float(arrival["sim_time"]) - departed_at
                # Negative durations are discarded.
                if not elapsed < 0:
                    finished_now.append(elapsed)
                    completed_mainline_travel_times.append(elapsed)

    step_mean = np.nan
    if finished_now:
        step_mean = float(np.mean(finished_now))
    overall_mean = np.nan
    if completed_mainline_travel_times:
        overall_mean = float(np.mean(completed_mainline_travel_times))
    return len(finished_now), step_mean, overall_mean
def _extract_reward_components(info: dict) -> Dict[str, float]:
    """Pick the reward-component columns out of a step ``info`` dict.

    Every name in ``REWARD_COMPONENT_COLUMNS`` appears in the result; a
    component missing from ``info`` is reported as NaN.
    """
    components = {}
    for name in REWARD_COMPONENT_COLUMNS:
        components[name] = info.get(name, np.nan)
    return components
def evaluate_single_model(
    model_name: str,
    checkpoint_dir: Optional[str],
    fallback_config_path: str,
    output_dir: str,
    seed: int,
    begin_time: Optional[int],
    end_time: Optional[int],
    with_gui: bool,
    step_length: Optional[float],
    route_file: Optional[str],
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict]:
    """Run one evaluation episode for a model and collect its metrics.

    The run configuration is loaded for the checkpoint, the evaluation
    overrides are applied, and a single episode is rolled out with
    deterministic actions (or the fixed baseline action for
    ``BASELINE_NAME``). The environment is closed even if building the agent,
    loading the checkpoint or the rollout raises.

    Args:
        model_name: Model key, or ``BASELINE_NAME`` for the run without agent.
        checkpoint_dir: Directory holding the model checkpoint; may be empty
            or ``None`` for the baseline.
        fallback_config_path: Config used as the base of the run config.
        output_dir: Evaluation output root; the environment's output goes to
            ``<output_dir>/<model_name>``.
        seed: Seed passed to ``env.reset``.
        begin_time: Optional override of ``sumo.begin_time``.
        end_time: Optional override of ``sumo.end_time``.
        with_gui: Value written to ``sumo.gui``.
        step_length: Optional override of ``sumo.step_length``.
        route_file: Optional override of ``sumo.route_file``.

    Returns:
        ``(step_df, edge_df, detector_df, meta)``: one row per decision step,
        per (step, edge) and per (step, detector cell), plus a dict describing
        the run.

    Raises:
        KeyError: If ``model_name`` has no entry in ``MODEL_LABELS``.
    """
    is_baseline = model_name == BASELINE_NAME
    model_label = MODEL_LABELS[model_name]

    config = load_config_for_checkpoint(checkpoint_dir or "", fallback_config_path)
    runtime_config = copy.deepcopy(config)

    # Evaluation-only overrides of the SUMO settings.
    sumo_cfg = runtime_config.setdefault("sumo", {})
    if begin_time is not None:
        sumo_cfg["begin_time"] = begin_time
    if end_time is not None:
        sumo_cfg["end_time"] = end_time
    sumo_cfg["gui"] = with_gui
    if step_length is not None:
        sumo_cfg["step_length"] = step_length
    if route_file is not None:
        sumo_cfg["route_file"] = route_file
    effective_begin_time = sumo_cfg.get("begin_time")
    effective_end_time = sumo_cfg.get("end_time")

    # Runtime flags enabling the extra data this script records.
    runtime_cfg = runtime_config.setdefault("runtime", {})
    runtime_cfg["output_dir"] = os.path.join(output_dir, model_name)
    runtime_cfg["metrics_subdir"] = "eval_sumo_metrics"
    runtime_cfg["collect_detector_cells"] = True
    runtime_cfg["use_vehicle_subscriptions"] = True
    runtime_cfg["collect_trip_events"] = True
    runtime_cfg["evaluation_mode"] = True

    step_rows = []
    edge_rows = []
    detector_rows = []
    load_path = None
    mainline_depart_times: Dict[str, float] = {}
    active_mainline_vehicle_ids = set()
    completed_mainline_travel_times: List[float] = []

    env = SUMOEdgeVSLEnvironment(runtime_config)
    try:
        agent = None
        if not is_baseline:
            agent = build_agent(model_name, runtime_config, env)
            load_path = resolve_model_load_path(model_name, checkpoint_dir)
            agent.load(load_path)
            if hasattr(agent, "reset_episode"):
                agent.reset_episode()

        state = env.reset(seed=seed)
        step_idx = 0
        while True:
            if is_baseline:
                action = select_no_control_action(env)
            else:
                action = select_deterministic_action(agent, state)
            next_state, reward, done, info = env.step(action, apply_control=True)
            # Agents that keep a temporal context are fed the transition.
            if agent is not None and hasattr(agent, "update_temporal_context"):
                agent.update_temporal_context(state, action, reward, info)
            step_idx += 1

            (
                mainline_completed_count,
                mainline_interval_travel_time_mean_s,
                mainline_travel_time_cumulative_mean_s,
            ) = update_mainline_travel_time_tracking(
                info,
                mainline_depart_times,
                active_mainline_vehicle_ids,
                completed_mainline_travel_times,
            )

            # ---- network-level metrics for this step ----
            step_row = {
                "model": model_name,
                "model_label": model_label,
                "step": step_idx,
                "sim_time": info.get("sim_time", np.nan),
                "reward": reward,
                "throughput": info.get("throughput", np.nan),
                "arrived_count": info.get("arrived_count", np.nan),
                "departed_count": info.get("departed_count", np.nan),
                "mean_speed_kmh": info.get("mean_speed_kmh", np.nan),
                "speed_variance_norm": info.get("speed_variance_norm", np.nan),
                "mean_occupancy": info.get("mean_occupancy", np.nan),
                "density": info.get("density", np.nan),
                "num_vehicles": info.get("num_vehicles", np.nan),
                "num_stops": info.get("num_stops", np.nan),
                "mainline_completed_count": mainline_completed_count,
                "mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s,
                "mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s,
            }
            step_row.update(_extract_reward_components(info))
            step_rows.append(step_row)

            measured_speeds_ms = info.get("edge_speeds_ms", [])
            occupancies = info.get("edge_occupancies", [])
            action_speeds_kmh = info.get("edge_speeds_kmh", [])
            action_applied_mask = info.get("action_applied_mask", [True] * env.num_edges)

            # ---- one row per edge ----
            for edge_idx, edge_id in enumerate(env.control_edges):
                action_index, logged_speed_kmh = resolve_logged_action_info(
                    model_name=model_name,
                    env=env,
                    action=action,
                    action_applied_mask=action_applied_mask,
                    edge_idx=edge_idx,
                    action_speed_kmh=float(action_speeds_kmh[edge_idx]),
                )
                edge_rows.append(
                    {
                        "model": model_name,
                        "model_label": model_label,
                        "step": step_idx,
                        "edge_index": edge_idx,
                        "edge_id": edge_id,
                        "action_index": action_index,
                        "action_speed_kmh": logged_speed_kmh,
                        "action_applied": bool(action_applied_mask[edge_idx]),
                        # m/s -> km/h
                        "measured_speed_kmh": float(measured_speeds_ms[edge_idx] * 3.6),
                        "occupancy": float(occupancies[edge_idx]),
                    }
                )

            # ---- one row per detector cell ----
            for detector_cell in info.get("detector_cells", []):
                edge_idx = detector_cell["edge_index"]
                action_index, logged_speed_kmh = resolve_logged_action_info(
                    model_name=model_name,
                    env=env,
                    action=action,
                    action_applied_mask=action_applied_mask,
                    edge_idx=edge_idx,
                    action_speed_kmh=float(action_speeds_kmh[edge_idx]),
                )
                detector_rows.append(
                    {
                        "model": model_name,
                        "model_label": model_label,
                        "step": step_idx,
                        "edge_index": edge_idx,
                        "edge_id": detector_cell["edge_id"],
                        "pos_index": detector_cell["pos_index"],
                        "position_m": detector_cell["position_m"],
                        "distance_m": detector_cell["distance_m"],
                        "cell_id": f"{detector_cell['edge_id']}@{detector_cell['pos_index']}",
                        "action_index": action_index,
                        "action_speed_kmh": logged_speed_kmh,
                        "action_applied": bool(action_applied_mask[edge_idx]),
                        # m/s -> km/h
                        "measured_speed_kmh": float(detector_cell["speed_ms"] * 3.6),
                        "occupancy": float(detector_cell["occupancy"]),
                        "vehicle_count": int(detector_cell["vehicle_count"]),
                    }
                )

            state = next_state
            if done:
                break
    finally:
        # Always release the environment, including on failure.
        env.close()

    step_df = pd.DataFrame(step_rows)
    edge_df = pd.DataFrame(edge_rows)
    detector_df = pd.DataFrame(detector_rows)

    checkpoint_path = ""
    if load_path:
        # Non-.pt load paths are stored without their .zip suffix; restore it.
        checkpoint_file = load_path if load_path.endswith(".pt") else f"{load_path}.zip"
        checkpoint_path = os.path.abspath(checkpoint_file)

    meta = {
        "model": model_name,
        "checkpoint_dir": os.path.abspath(checkpoint_dir) if checkpoint_dir else "",
        "checkpoint_path": checkpoint_path,
        "num_steps": len(step_df),
        "num_edges": env.num_edges,
        "begin_time": effective_begin_time,
        "end_time": effective_end_time,
        "with_gui": with_gui,
        "step_length": sumo_cfg.get("step_length"),
        "route_file": sumo_cfg.get("route_file", ""),
        "mainline_completed_total": len(completed_mainline_travel_times),
        "mainline_travel_time_mean_s": (
            float(np.mean(completed_mainline_travel_times))
            if completed_mainline_travel_times
            else np.nan
        ),
    }
    return step_df, edge_df, detector_df, meta
def evaluate_worker(task: Tuple[str, Optional[str], str, str, int, Optional[int], Optional[int], bool, Optional[float], Optional[str]]):
    """Run ``evaluate_single_model`` on a packed argument tuple.

    ``task`` holds the positional arguments of ``evaluate_single_model`` in
    order; its result is returned unchanged. Taking a single argument
    presumably lets this be submitted to an executor — confirm at the call
    site.
    """
    outcome = evaluate_single_model(*task)
    return outcome
def build_summary(step_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-step metrics into one summary row per model.

    Rows are grouped by ``model`` / ``model_label``. In addition to the fixed
    statistics below, the mean of every reward component column is added as
    ``<column>_mean``. The result is sorted by each model's position in
    ``EVAL_ORDER``.
    """
    spec = {
        "steps": ("step", "count"),
        "reward_sum": ("reward", "sum"),
        "reward_mean": ("reward", "mean"),
        "throughput_mean": ("throughput", "mean"),
        "throughput_std": ("throughput", "std"),
        "mean_speed_kmh_mean": ("mean_speed_kmh", "mean"),
        "mean_speed_kmh_std": ("mean_speed_kmh", "std"),
        "speed_variance_norm_mean": ("speed_variance_norm", "mean"),
        "density_mean": ("density", "mean"),
        "stops_total": ("num_stops", "sum"),
        "stops_mean": ("num_stops", "mean"),
        "mainline_completed_total": ("mainline_completed_count", "sum"),
        # The cumulative mean at the final step is the episode-level mean.
        "mainline_travel_time_mean_s": ("mainline_travel_time_cumulative_mean_s", "last"),
    }
    spec.update({f"{column}_mean": (column, "mean") for column in REWARD_COMPONENT_COLUMNS})

    per_model = step_df.groupby(["model", "model_label"], sort=False)
    summary = per_model.agg(**spec).reset_index()

    # A missing standard deviation is reported as 0.
    for std_column in ("throughput_std", "mean_speed_kmh_std"):
        summary[std_column] = summary[std_column].fillna(0.0)

    rank = {name: position for position, name in enumerate(EVAL_ORDER)}
    return summary.sort_values("model", key=lambda models: models.map(rank))
def _rows_for_model(frame: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """Return the rows of ``frame`` for ``model_name`` (none if it has no ``model`` column)."""
    if "model" not in frame.columns:
        return frame.iloc[0:0]
    return frame[frame["model"] == model_name]


def save_csv_outputs(
    step_df: pd.DataFrame,
    edge_df: pd.DataFrame,
    detector_df: pd.DataFrame,
    summary_df: pd.DataFrame,
    meta_rows: List[dict],
    output_dir: str,
):
    """Write the combined and per-model evaluation CSV files.

    Files written into ``output_dir``: ``step_metrics.csv``,
    ``edge_metrics.csv``, ``detector_metrics.csv``, ``summary.csv`` and
    ``evaluation_manifest.csv``. The three metric files are also written into
    ``per_model/<model>/`` for every model present in ``step_df``.

    A frame built from zero rows has no columns at all (for example when no
    detector cells were reported). Such a frame is written as an empty file
    rather than raising ``KeyError`` on the missing ``model`` column.

    Args:
        step_df: Per-step metrics of all models.
        edge_df: Per-edge metrics of all models.
        detector_df: Per-detector-cell metrics of all models.
        summary_df: One summary row per model.
        meta_rows: One manifest dict per evaluated model.
        output_dir: Destination directory, created if needed.
    """
    os.makedirs(output_dir, exist_ok=True)
    metric_frames = (
        ("step_metrics.csv", step_df),
        ("edge_metrics.csv", edge_df),
        ("detector_metrics.csv", detector_df),
    )
    for filename, frame in metric_frames:
        frame.to_csv(os.path.join(output_dir, filename), index=False)
    summary_df.to_csv(os.path.join(output_dir, "summary.csv"), index=False)
    pd.DataFrame(meta_rows).to_csv(os.path.join(output_dir, "evaluation_manifest.csv"), index=False)

    per_model_dir = os.path.join(output_dir, "per_model")
    os.makedirs(per_model_dir, exist_ok=True)
    model_names = step_df["model"].unique() if "model" in step_df.columns else []
    for model_name in model_names:
        model_dir = os.path.join(per_model_dir, model_name)
        os.makedirs(model_dir, exist_ok=True)
        for filename, frame in metric_frames:
            _rows_for_model(frame, model_name).to_csv(os.path.join(model_dir, filename), index=False)
def plot_step_comparison(step_df: pd.DataFrame, output_dir: str):
    """Plot the per-step metrics of all models and save ``comparison_timeseries.png``.

    One subplot per metric on a 4x2 grid with one line per model found in
    ``step_df`` (in ``EVAL_ORDER``). Unused grid cells are hidden and the
    legend is drawn on the first subplot only.
    """
    panels = [
        ("reward", "Reward"),
        ("throughput", "Throughput (veh/h)"),
        ("mean_speed_kmh", "Mean Speed (km/h)"),
        ("speed_variance_norm", "Normalized Speed Variance"),
        ("num_stops", "Stops"),
        ("density", "Density (veh/km)"),
        ("mainline_travel_time_cumulative_mean_s", "Avg Mainline Travel Time (s)"),
    ]
    fig, axes = plt.subplots(4, 2, figsize=(16, 15), sharex=True)
    flat_axes = axes.flatten()
    used_axes = flat_axes[:len(panels)]
    spare_axes = flat_axes[len(panels):]

    for axis, (column, title) in zip(used_axes, panels):
        for model_name in EVAL_ORDER:
            series = step_df[step_df["model"] == model_name]
            if not series.empty:
                axis.plot(series["step"], series[column], label=MODEL_LABELS[model_name], linewidth=1.8)
        axis.set_title(title)
        axis.set_xlabel("Step")
        axis.grid(True, alpha=0.3)
    for axis in spare_axes:
        axis.axis("off")

    flat_axes[0].legend()
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "comparison_timeseries.png"), dpi=160)
    plt.close(fig)
def plot_reward_components(step_df: pd.DataFrame, output_dir: str):
    """Plot every reward component over time and save ``reward_components.png``.

    One subplot per entry of ``REWARD_COMPONENT_COLUMNS`` in a two-column
    grid, with one line per model found in ``step_df`` (in ``EVAL_ORDER``).
    The grid has at least four rows, which is the layout used for up to eight
    components, and grows when there are more so that no component is dropped.

    Args:
        step_df: Per-step metrics containing the reward component columns.
        output_dir: Existing directory the PNG is written to.
    """
    components = [
        (column, REWARD_COMPONENT_LABELS[column])
        for column in REWARD_COMPONENT_COLUMNS
    ]
    n_cols = 2
    # Ceiling division; never fewer than the four rows of the original layout.
    n_rows = max(4, (len(components) + n_cols - 1) // n_cols)
    # 3.5 inches per row reproduces the original 15x14 figure for four rows.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3.5 * n_rows), sharex=True)
    axes = axes.flatten()

    for ax, (column, title) in zip(axes, components):
        for model_name in EVAL_ORDER:
            model_df = step_df[step_df["model"] == model_name]
            if model_df.empty:
                continue
            ax.plot(model_df["step"], model_df[column], label=MODEL_LABELS[model_name], linewidth=1.8)
        ax.set_title(title)
        ax.set_xlabel("Step")
        ax.grid(True, alpha=0.3)
    # Hide grid cells that have no component.
    for ax in axes[len(components):]:
        ax.axis("off")

    axes[0].legend()
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "reward_components.png"), dpi=160)
    plt.close(fig)
def plot_summary_bars(summary_df: pd.DataFrame, output_dir: str):
    """Draw one bar chart per summary metric and save ``summary_bars.png``.

    The charts sit on a 3x2 grid with one bar per row of ``summary_df``,
    labelled with ``model_label``. Unused grid cells are hidden.
    """
    bar_specs = [
        ("throughput_mean", "Avg Throughput (veh/h)"),
        ("mean_speed_kmh_mean", "Avg Mean Speed (km/h)"),
        ("speed_variance_norm_mean", "Avg Normalized Speed Variance"),
        ("stops_total", "Total Stops"),
        ("mainline_travel_time_mean_s", "Avg Mainline Travel Time (s)"),
    ]
    fig, axes = plt.subplots(3, 2, figsize=(14, 12))
    flat_axes = axes.flatten()
    tick_labels = summary_df["model_label"].tolist()
    positions = np.arange(len(tick_labels))

    for axis, (column, title) in zip(flat_axes[:len(bar_specs)], bar_specs):
        axis.bar(positions, summary_df[column].values)
        axis.set_title(title)
        axis.set_xticks(positions)
        axis.set_xticklabels(tick_labels)
        axis.grid(True, axis="y", alpha=0.3)
    for axis in flat_axes[len(bar_specs):]:
        axis.axis("off")

    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "summary_bars.png"), dpi=160)
    plt.close(fig)
def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output_dir: str):
    """Save a three-panel heatmap image (speed, applied VSL, occupancy) per model.

    For every model in ``EVAL_ORDER`` that has rows in both frames, the
    detector records are pivoted into cell-by-step grids and the edge records
    into an edge-by-step grid. The panels are written to
    ``<output_dir>/heatmaps/<model>_heatmaps.png``.

    Args:
        edge_df: Per-edge, per-step records with ``model``, ``edge_index``,
            ``edge_id``, ``step``, ``action_speed_kmh`` and optionally
            ``action_applied``.
        detector_df: Per-detector-cell, per-step records with ``model``,
            ``edge_index``, ``edge_id``, ``pos_index``, ``cell_id``,
            ``distance_m``, ``step``, ``measured_speed_kmh`` and ``occupancy``.
        output_dir: Directory under which the ``heatmaps`` folder is created.
    """
    heatmap_dir = os.path.join(output_dir, "heatmaps")
    os.makedirs(heatmap_dir, exist_ok=True)

    detector_axis_label = "Detector Cell (bottom=upstream, top=downstream)"

    def step_grid(frame, row_key, value_column, row_order):
        # Rows follow ``row_order``; columns are the decision steps.
        wide = frame.pivot(index=row_key, columns="step", values=value_column)
        return wide.reindex(row_order).values

    for model_name in EVAL_ORDER:
        model_detectors = detector_df[detector_df["model"] == model_name]
        model_edges = edge_df[edge_df["model"] == model_name]
        if model_detectors.empty or model_edges.empty:
            continue

        # Detector cells are ordered by distance, then edge and position index.
        cell_columns = ["edge_index", "edge_id", "pos_index", "cell_id", "distance_m"]
        unique_cells = model_detectors[cell_columns].drop_duplicates()
        sorted_cells = unique_cells.sort_values(["distance_m", "edge_index", "pos_index"])
        ordered_cell_ids = sorted_cells["cell_id"].tolist()

        speed_grid = step_grid(model_detectors, "cell_id", "measured_speed_kmh", ordered_cell_ids)
        occ_grid = step_grid(model_detectors, "cell_id", "occupancy", ordered_cell_ids)

        unique_edges = model_edges[["edge_index", "edge_id"]].drop_duplicates()
        ordered_edge_ids = unique_edges.sort_values("edge_index")["edge_id"].tolist()

        # Work on a copy so the caller's frame is not modified when blanking
        # out the steps where no action was applied.
        action_source = model_edges.copy()
        if "action_applied" in action_source.columns:
            not_applied = ~action_source["action_applied"].astype(bool)
            action_source.loc[not_applied, "action_speed_kmh"] = np.nan
        action_grid = step_grid(action_source, "edge_id", "action_speed_kmh", ordered_edge_ids)

        label = MODEL_LABELS[model_name]
        speed_panel = build_speed_panel(
            speed_grid,
            ordered_cell_ids,
            f"{label} Measured Speed (km/h)",
            detector_axis_label,
        )
        action_panel = build_action_panel(
            action_grid,
            ordered_edge_ids,
            f"{label} Applied VSL (km/h)",
        )
        occupancy_panel = build_occupancy_panel(
            occ_grid,
            ordered_cell_ids,
            f"{label} Occupancy (%)",
            detector_axis_label,
        )

        save_heatmap_panels(
            os.path.join(heatmap_dir, f"{model_name}_heatmaps.png"),
            [speed_panel, action_panel, occupancy_panel],
            xlabel="Decision Step",
        )
def _format_metric(value: float, fmt: str) -> str:
return "N/A" if pd.isna(value) else format(value, fmt)
def print_summary(summary_df: pd.DataFrame, output_dir: str):
    """Print a per-model summary of the evaluation metrics to stdout.

    One section is printed per row of ``summary_df``, framed by a banner, and
    followed by the absolute path of ``output_dir``. The mainline travel time
    goes through ``_format_metric`` so a missing value is shown as "N/A".

    Args:
        summary_df: Per-model summary table with ``model_label`` and the
            metric columns referenced below.
        output_dir: Directory the results were written to; only printed.
    """
    rule = "=" * 72
    print("\n" + rule)
    print("Evaluation Summary")
    print(rule)

    for _, record in summary_df.iterrows():
        travel_time = _format_metric(record["mainline_travel_time_mean_s"], ".2f")
        section = [
            f"\n{record['model_label']}:",
            f" Avg Throughput: {record['throughput_mean']:.1f} veh/h",
            f" Avg Mean Speed: {record['mean_speed_kmh_mean']:.1f} km/h",
            f" Avg Normalized Speed Variance: {record['speed_variance_norm_mean']:.4f}",
            f" Total Stops: {record['stops_total']:.0f}",
            f" Mainline Completed Vehicles: {record['mainline_completed_total']:.0f}",
            " Avg Mainline Travel Time: " f"{travel_time} s",
            f" Avg Density: {record['density_mean']:.2f} veh/km",
            f" Episode Reward Sum: {record['reward_sum']:.2f}",
        ]
        print("\n".join(section))

    print(rule)
    print(f"Saved to: {os.path.abspath(output_dir)}")
def main():
    """Evaluate the selected models plus a no-control baseline and export results.

    Steps:
      1. Parse CLI arguments and resolve the route file, checkpoint root and
         output directory.
      2. Build one evaluation task per discovered model (in ``MODEL_ORDER``)
         and prepend a ``BASELINE_NAME`` task.
      3. Run the tasks through ``evaluate_worker`` serially or in a process
         pool.
      4. Concatenate the per-model frames, then write the CSVs, the plots and
         the console summary.

    Raises:
        FileNotFoundError: If a custom route file was given but does not
            exist, if no model directory matches the selection, or if no
            evaluation task could be created.
    """
    args = parse_args()
    route_file = resolve_project_path(args.route_file)
    if route_file is not None and not os.path.isfile(route_file):
        raise FileNotFoundError(f"Custom route file not found: {route_file}")

    checkpoint_root = resolve_checkpoint_root(args.checkpoint_root)
    model_dirs = discover_model_dirs(checkpoint_root, args.models)
    if not model_dirs:
        raise FileNotFoundError("No models matched the requested selection.")

    output_dir = resolve_eval_output_dir(args.output_dir, checkpoint_root)
    os.makedirs(output_dir, exist_ok=True)

    def make_task(name, checkpoint_dir):
        # Positional payload handed to evaluate_worker; the field order must
        # stay exactly as the worker expects it.
        return (
            name,
            checkpoint_dir,
            args.config,
            output_dir,
            args.seed,
            args.begin_time,
            args.end_time,
            args.with_gui,
            args.step_length,
            route_file,
        )

    tasks = [
        make_task(model_name, model_dirs[model_name])
        for model_name in MODEL_ORDER
        if model_dirs.get(model_name) is not None
    ]
    if not tasks:
        raise FileNotFoundError("No evaluation tasks were created.")

    # The baseline has no checkpoint of its own; it is given the checkpoint
    # directory of the first model task and always runs first.
    tasks.insert(0, make_task(BASELINE_NAME, tasks[0][1]))

    default_workers = min(len(tasks), max(1, (os.cpu_count() or 2) // 2))
    max_workers = args.workers or default_workers
    if args.with_gui and max_workers > 1:
        print("GUI evaluation requested, forcing workers=1 to avoid multiple SUMO GUI windows.")
        max_workers = 1

    # One (step_df, edge_df, detector_df, meta) tuple per task, in task order.
    results = [None] * len(tasks)
    if max_workers <= 1 or len(tasks) == 1:
        for index, task in enumerate(tasks):
            model_name, checkpoint_dir, *_ = task
            print(f"Evaluating {MODEL_LABELS[model_name]} from {checkpoint_dir}")
            results[index] = evaluate_worker(task)
    else:
        print(f"Running evaluation in parallel with {max_workers} workers")
        future_to_index = {}
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            for index, task in enumerate(tasks):
                model_name, checkpoint_dir, *_ = task
                print(f"Queueing {MODEL_LABELS[model_name]} from {checkpoint_dir}")
                future_to_index[executor.submit(evaluate_worker, task)] = index
            for future in concurrent.futures.as_completed(future_to_index):
                index = future_to_index[future]
                # Store by task index rather than appending: completion order
                # is arbitrary, and the exported rows should come out in the
                # same order as a serial run.
                results[index] = future.result()
                print(f"Finished {MODEL_LABELS[tasks[index][0]]}")

    all_step_dfs = []
    all_edge_dfs = []
    all_detector_dfs = []
    meta_rows = []
    for step_part, edge_part, detector_part, meta in results:
        all_step_dfs.append(step_part)
        all_edge_dfs.append(edge_part)
        all_detector_dfs.append(detector_part)
        meta_rows.append(meta)

    step_df = pd.concat(all_step_dfs, ignore_index=True)
    edge_df = pd.concat(all_edge_dfs, ignore_index=True)
    detector_df = pd.concat(all_detector_dfs, ignore_index=True)
    summary_df = build_summary(step_df)

    save_csv_outputs(step_df, edge_df, detector_df, summary_df, meta_rows, output_dir)
    plot_step_comparison(step_df, output_dir)
    plot_reward_components(step_df, output_dir)
    plot_summary_bars(summary_df, output_dir)
    plot_model_heatmaps(edge_df, detector_df, output_dir)
    print_summary(summary_df, output_dir)
# Run the evaluation only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()