diff --git a/.gitignore b/.gitignore index 09e066a..63e2bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ Thumbs.db # AI .claude/ +.codex_tmp/ checkpoints_*/ logs_*/ diff --git a/agents/tcamappo_agent.py b/agents/tcamappo_agent.py index 502e93f..1e55e44 100644 --- a/agents/tcamappo_agent.py +++ b/agents/tcamappo_agent.py @@ -12,6 +12,8 @@ import torch import torch.nn as nn import torch.optim as optim +from envs.reward_system import REWARD_COMPONENT_COLUMNS + class SharedActor(nn.Module): def __init__(self, local_obs_dim: int, num_actions: int, hidden_dim: int = 256): @@ -182,7 +184,8 @@ class TCAMAPPOAgent: + self.last_reward_dim + self.agent_id_dim ) - self.history_token_dim = state_dim + num_agents + 5 + self.reward_feature_dim = 1 + len(REWARD_COMPONENT_COLUMNS) + self.history_token_dim = state_dim + num_agents + self.reward_feature_dim self.actor = SharedActor(self.local_obs_dim, num_actions, hidden_dim).to(self.device) self.critic = TemporalCreditCritic( @@ -260,10 +263,7 @@ class TCAMAPPOAgent: reward_features = np.array( [ float(reward) / 10.0, - float(info.get("r_flow", 0.0)), - float(info.get("r_var", 0.0)), - float(info.get("r_brake", 0.0)), - float(info.get("r_penalty", 0.0)), + *[float(info.get(column, 0.0)) for column in REWARD_COMPONENT_COLUMNS], ], dtype=np.float32, ) diff --git a/config_sumo_vsl.yaml b/config_sumo_vsl.yaml index 22fe076..23b84e4 100644 --- a/config_sumo_vsl.yaml +++ b/config_sumo_vsl.yaml @@ -52,23 +52,24 @@ environment: free_flow_speed: 30.56 reward: - # Positive term: throughput - w_flow: 0.4 - # Negative term: speed dispersion across edges - w_var: 0.3 - # Density-adaptive hard-brake penalty weight - w_brake_base: 0.1 - w_brake_max: 0.3 - # Penalty for abrupt VSL changes - w_penalty: 0.2 + reward_scale: 10.0 + outflow_weight: 0.75 + bottleneck_weight: 0.20 + control_weight_start: 0.05 + control_weight_end: 0.01 + control_weight_decay_power: 1.5 + mainline_discharge_ref_vehph: 4924.0 + + bottleneck_critical_occupancy: 15.0 + bottleneck_excess_occupancy_band: 10.0 + bottleneck_window_size: 3 + + control_temporal_weight: 0.7 + control_spatial_weight: 0.3 - rho_critical: 44.75 - k_sigmoid: 0.2 d_threshold: 5.0 - d_max: 20.0 - C_max: 4924 v_limit: 30.56 - delta_vsl_max: 16.67 + delta_vsl_max: 19.44 leader_gap_threshold_m: 100.0 training: diff --git a/envs/edge_vsl_env.py b/envs/edge_vsl_env.py index de551a9..a1d2d6e 100644 --- a/envs/edge_vsl_env.py +++ b/envs/edge_vsl_env.py @@ -19,6 +19,7 @@ import traci import traci.constants as tc from envs.network_parser import SUMONetworkParser +from envs.reward_system import RewardCalculator, RewardConfig from utils.experiment_corridor import prepare_experiment_corridor_assets @@ -40,6 +41,7 @@ class SUMOEdgeVSLEnvironment: runtime_cfg = config.get("runtime", {}) self.runtime_output_dir = runtime_cfg.get("output_dir") self.runtime_metrics_subdir = runtime_cfg.get("metrics_subdir", "sumo_metrics") + self.evaluation_mode = bool(runtime_cfg.get("evaluation_mode", False)) self.runtime_detector_add_file: Optional[str] = None self.runtime_enex_add_file: Optional[str] = None self.collect_detector_cells = runtime_cfg.get( @@ -117,21 +119,11 @@ class SUMOEdgeVSLEnvironment: self.control_edge_set = set(self.physical_control_edges) self.reward_cfg = env_cfg.get("reward", {}) - self.w1 = self.reward_cfg.get("w_flow", 0.4) - self.w2 = self.reward_cfg.get("w_var", 0.3) - self.w_base = self.reward_cfg.get("w_brake_base", 0.1) - self.w_max = self.reward_cfg.get("w_brake_max", 0.5) - self.w4 = self.reward_cfg.get("w_penalty", 0.2) - self.rho_critical = self.reward_cfg.get("rho_critical", 35.0) - self.k_sigmoid = self.reward_cfg.get("k_sigmoid", 0.2) - self.d_th = self.reward_cfg.get("d_threshold", 3.0) - self.d_max = self.reward_cfg.get("d_max", 8.0) - self.C_max = self.reward_cfg.get("C_max", 6000.0) - self.v_limit = self.reward_cfg.get("v_limit", 33.33) - self.delta_vsl_max = self.reward_cfg.get("delta_vsl_max", 60.0 / 3.6) - self.leader_gap_threshold_m = float( - self.reward_cfg.get("leader_gap_threshold_m", 100.0) + self.reward_config = RewardConfig.from_dict( + self.reward_cfg, + speed_actions_ms=self.speed_actions_ms, ) + self.total_training_episodes = int(config.get("training", {}).get("num_episodes", 1)) self.parser = SUMONetworkParser( detector_add_file=self._detector_add_template, @@ -140,6 +132,12 @@ class SUMOEdgeVSLEnvironment: self.controlled_length_km = corridor_assets.controlled_length_m / 1000.0 self.default_edge_speeds = self._build_default_segment_speeds() self.max_segment_speeds = self.default_edge_speeds.copy() + self.reward_calculator = RewardCalculator( + config=self.reward_config, + controlled_edge_start_index=self.controlled_edge_start_index, + total_training_episodes=self.total_training_episodes, + evaluation_mode=self.evaluation_mode, + ) self.action_dims = [self.num_speed_actions] * self.num_controlled_edges self.features_per_edge = 3 @@ -153,6 +151,10 @@ class SUMOEdgeVSLEnvironment: self._last_reward = 0.0 self.episode_metrics: List[Dict] = [] self._tracked_vehicle_ids: Set[str] = set() + self._mainline_depart_times: Dict[str, float] = {} + self._active_mainline_vehicle_ids: Set[str] = set() + self._completed_mainline_travel_times: List[float] = [] + self._interval_mainline_travel_times: List[float] = [] print("SUMO Edge VSL Environment initialized") print(f" Control segments: {self.num_edges}") @@ -285,6 +287,8 @@ class SUMOEdgeVSLEnvironment: pass self._sumo_running = False self._tracked_vehicle_ids.clear() + self._mainline_depart_times.clear() + self._active_mainline_vehicle_ids.clear() def reset(self, seed: Optional[int] = None) -> np.ndarray: self._episode_count += 1 @@ -294,6 +298,10 @@ class SUMOEdgeVSLEnvironment: self._prev_edge_speeds = self.default_edge_speeds.copy() self._last_reward = 0.0 self._tracked_vehicle_ids.clear() + self._mainline_depart_times = {} + self._active_mainline_vehicle_ids = set() + self._completed_mainline_travel_times = [] + self._interval_mainline_travel_times = [] self._start_sumo(seed=seed) warmup_steps = int(900 / self.control_interval) @@ -324,33 +332,13 @@ class SUMOEdgeVSLEnvironment: self._interval_departed = 0 self._interval_departed_vehicle_events = [] self._interval_arrived_vehicle_events = [] + self._interval_mainline_travel_times = [] for _ in range(self.steps_per_action): traci.simulationStep() sim_time = float(traci.simulation.getTime()) self._interval_arrived += traci.simulation.getArrivedNumber() self._interval_departed += traci.simulation.getDepartedNumber() - if self.collect_trip_events: - for veh_id in traci.simulation.getDepartedIDList(): - is_mainline = False - try: - route_edges = traci.vehicle.getRoute(veh_id) - is_mainline = any(edge_id in self.control_edge_set for edge_id in route_edges) - except Exception: - pass - self._interval_departed_vehicle_events.append( - { - "vehicle_id": veh_id, - "sim_time": sim_time, - "is_mainline": bool(is_mainline), - } - ) - for veh_id in traci.simulation.getArrivedIDList(): - self._interval_arrived_vehicle_events.append( - { - "vehicle_id": veh_id, - "sim_time": sim_time, - } - ) + self._update_mainline_trip_tracking(sim_time) detector_data = self._get_edge_detector_data() state = self._collect_state(detector_data) @@ -382,6 +370,62 @@ class SUMOEdgeVSLEnvironment: def close(self): self._close_sumo() + def _update_mainline_trip_tracking(self, sim_time: float): + departed_ids = [] + arrived_ids = [] + try: + departed_ids = list(traci.simulation.getDepartedIDList()) + except Exception: + departed_ids = [] + try: + arrived_ids = list(traci.simulation.getArrivedIDList()) + except Exception: + arrived_ids = [] + + for veh_id in departed_ids: + is_mainline = False + try: + route_edges = traci.vehicle.getRoute(veh_id) + is_mainline = any(edge_id in self.control_edge_set for edge_id in route_edges) + except Exception: + is_mainline = False + + if is_mainline: + self._mainline_depart_times[veh_id] = float(sim_time) + self._active_mainline_vehicle_ids.add(veh_id) + + if self.collect_trip_events: + self._interval_departed_vehicle_events.append( + { + "vehicle_id": veh_id, + "sim_time": sim_time, + "is_mainline": bool(is_mainline), + } + ) + + for veh_id in arrived_ids: + if self.collect_trip_events: + self._interval_arrived_vehicle_events.append( + { + "vehicle_id": veh_id, + "sim_time": sim_time, + } + ) + + if veh_id not in self._active_mainline_vehicle_ids: + continue + + depart_time = self._mainline_depart_times.pop(veh_id, None) + self._active_mainline_vehicle_ids.discard(veh_id) + if depart_time is None: + continue + + travel_time = float(sim_time) - float(depart_time) + if travel_time < 0: + continue + self._interval_mainline_travel_times.append(travel_time) + self._completed_mainline_travel_times.append(travel_time) + def _decode_action(self, action: np.ndarray) -> np.ndarray: action_array = np.asarray(action, dtype=np.int64).reshape(-1) if action_array.size != self.num_controlled_edges: @@ -565,12 +609,15 @@ class SUMOEdgeVSLEnvironment: controlled_vehicle_count += 1 speed = results.get(tc.VAR_SPEED) accel = results.get(tc.VAR_ACCELERATION) - if accel is not None and accel < -self.d_th: + if accel is not None and accel < -self.reward_config.d_threshold: brake_decels.append(abs(accel)) relative_speed = 0.0 if speed is not None: try: - leader_info = traci.vehicle.getLeader(veh_id, self.leader_gap_threshold_m) + leader_info = traci.vehicle.getLeader( + veh_id, + self.reward_config.leader_gap_threshold_m, + ) except Exception: leader_info = None if leader_info: @@ -610,12 +657,15 @@ class SUMOEdgeVSLEnvironment: accel = None speed = None - if accel is not None and accel < -self.d_th: + if accel is not None and accel < -self.reward_config.d_threshold: brake_decels.append(abs(accel)) relative_speed = 0.0 if speed is not None and speed >= 0: try: - leader_info = traci.vehicle.getLeader(veh_id, self.leader_gap_threshold_m) + leader_info = traci.vehicle.getLeader( + veh_id, + self.reward_config.leader_gap_threshold_m, + ) except Exception: leader_info = None if leader_info: @@ -648,9 +698,10 @@ class SUMOEdgeVSLEnvironment: info["arrived_count"] = self._interval_arrived info["departed_count"] = self._interval_departed - edge_speeds, edge_occs, _, valid_speeds = detector_data + edge_speeds, edge_occs, edge_counts, valid_speeds = detector_data info["edge_speeds_ms"] = edge_speeds info["edge_occupancies"] = edge_occs + info["edge_vehicle_counts"] = edge_counts info["mean_speed"] = np.mean(valid_speeds) if valid_speeds else 0.0 info["mean_speed_kmh"] = info["mean_speed"] * 3.6 info["mean_occupancy"] = np.mean(edge_occs) if edge_occs else 0.0 @@ -671,9 +722,47 @@ class SUMOEdgeVSLEnvironment: info["relative_speed_samples"] = relative_speed_samples info["relative_speed_variance"] = float(np.var(relative_speed_samples)) if relative_speed_samples else 0.0 info["speed_variance_norm"] = ( - info["relative_speed_variance"] / max(self.v_limit ** 2, 1e-6) + info["relative_speed_variance"] / max(self.reward_config.v_limit ** 2, 1e-6) ) + active_start = self.controlled_edge_start_index + active_occs = np.asarray(edge_occs[active_start:], dtype=float) + active_counts = np.asarray(edge_counts[active_start:], dtype=float) + + if active_occs.size > 0: + bottleneck_window = active_occs[-self.reward_config.bottleneck_window_size :] + bottleneck_rel_idx = int(np.argmax(bottleneck_window)) + bottleneck_abs_idx = active_start + max( + 0, + active_occs.size - self.reward_config.bottleneck_window_size, + ) + bottleneck_rel_idx + info["bottleneck_segment_index"] = bottleneck_abs_idx + info["bottleneck_occupancy"] = float(bottleneck_window[bottleneck_rel_idx]) + else: + info["bottleneck_segment_index"] = -1 + info["bottleneck_occupancy"] = 0.0 + + info["downstream_mainline_outflow"] = ( + float(active_counts[-1]) * (3600.0 / self.control_interval) + if active_counts.size > 0 + else 0.0 + ) + + interval_tt_mean = ( + float(np.mean(self._interval_mainline_travel_times)) + if self._interval_mainline_travel_times + else np.nan + ) + cumulative_tt_mean = ( + float(np.mean(self._completed_mainline_travel_times)) + if self._completed_mainline_travel_times + else np.nan + ) + info["mainline_completed_count"] = len(self._interval_mainline_travel_times) + info["mainline_interval_travel_time_mean_s"] = interval_tt_mean + info["mainline_travel_time_cumulative_mean_s"] = cumulative_tt_mean + info["mainline_active_vehicle_count"] = len(self._active_mainline_vehicle_ids) + try: info["sim_time"] = traci.simulation.getTime() except Exception: @@ -685,77 +774,12 @@ class SUMOEdgeVSLEnvironment: self, detector_data: Tuple[List[float], List[float], List[int], List[float]], ) -> Dict: - info = {} - - throughput = self._interval_arrived * (3600.0 / self.control_interval) - info["throughput"] = throughput - info["arrived_count"] = self._interval_arrived - info["departed_count"] = self._interval_departed - - edge_speeds, edge_occs, _, valid_speeds = detector_data - info["edge_speeds_ms"] = edge_speeds - info["edge_occupancies"] = edge_occs - info["mean_speed"] = np.mean(valid_speeds) if valid_speeds else 0.0 - info["mean_speed_kmh"] = info["mean_speed"] * 3.6 - info["mean_occupancy"] = np.mean(edge_occs) if edge_occs else 0.0 - - controlled_vehicle_count, brake_decels, valid_following_count, relative_speed_samples = ( - self._collect_controlled_vehicle_metrics() - ) - info["num_vehicles"] = controlled_vehicle_count - info["density"] = ( - controlled_vehicle_count / self.controlled_length_km - if self.controlled_length_km > 0 - else 0.0 - ) - info["brake_decels"] = brake_decels - info["num_hard_brakes"] = len(brake_decels) - info["valid_following_count"] = int(valid_following_count) - info["risky_following_count"] = int(sum(value > 0 for value in relative_speed_samples)) - info["relative_speed_samples"] = relative_speed_samples - info["relative_speed_variance"] = float(np.var(relative_speed_samples)) if relative_speed_samples else 0.0 - info["speed_variance_norm"] = ( - info["relative_speed_variance"] / max(self.v_limit ** 2, 1e-6) - ) - - try: - info["sim_time"] = traci.simulation.getTime() - except Exception: - info["sim_time"] = 0.0 - - return info + return self._collect_runtime_metrics(detector_data) def _calculate_reward(self, info: Dict) -> float: - q_t = info["throughput"] - r_flow = q_t / self.C_max - - speed_variance_norm = info["speed_variance_norm"] - r_var = -speed_variance_norm - - rho_t = info["density"] - w3 = self.w_base + (self.w_max - self.w_base) / ( - 1 + np.exp(-self.k_sigmoid * (rho_t - self.rho_critical)) + return self.reward_calculator.calculate( + info=info, + current_edge_speeds=self.current_edge_speeds, + prev_edge_speeds=self._prev_edge_speeds, + episode_index=self._episode_count, ) - - brake_decels = info["brake_decels"] - total_vehicles = max(info["num_vehicles"], 1) - if brake_decels: - sum_brake_penalty = sum( - max(0, (d - self.d_th) / (self.d_max - self.d_th)) - for d in brake_decels - ) - brake_penalty = sum_brake_penalty / total_vehicles - else: - brake_penalty = 0.0 - r_brake = -brake_penalty - - vsl_change = np.abs(self.current_edge_speeds - self._prev_edge_speeds) - max_vsl_change = np.max(vsl_change) - r_penalty = -max_vsl_change / self.delta_vsl_max - - info["r_flow"] = float(r_flow) - info["r_var"] = float(r_var) - info["r_brake"] = float(r_brake) - info["r_penalty"] = float(r_penalty) - reward = self.w1 * r_flow + self.w2 * r_var + w3 * r_brake + self.w4 * r_penalty - return float(reward * 10.0) diff --git a/envs/reward_system.py b/envs/reward_system.py new file mode 100644 index 0000000..3a018b8 --- /dev/null +++ b/envs/reward_system.py @@ -0,0 +1,183 @@ +"""Shared reward configuration and calculation for freeway VSL environments.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Mapping, Sequence + +import numpy as np + + +REWARD_COMPONENT_COLUMNS = ( + "r_outflow", + "r_bottleneck", + "r_ctrl", +) + +REWARD_COMPONENT_LABELS = { + "r_outflow": "R_outflow", + "r_bottleneck": "R_bottleneck", + "r_ctrl": "R_ctrl", +} + + +def clip01(value: float) -> float: + return float(np.clip(value, 0.0, 1.0)) + + +def init_reward_component_totals() -> Dict[str, float]: + return {column: 0.0 for column in REWARD_COMPONENT_COLUMNS} + + +def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[str, float]: + denom = max(int(steps), 1) + return {column: float(totals.get(column, 0.0)) / denom for column in REWARD_COMPONENT_COLUMNS} + + +@dataclass(frozen=True) +class RewardConfig: + reward_scale: float = 10.0 + outflow_weight: float = 0.75 + bottleneck_weight: float = 0.20 + control_weight_start: float = 0.05 + control_weight_end: float = 0.01 + control_weight_decay_power: float = 1.5 + mainline_discharge_ref_vehph: float = 4924.0 + bottleneck_critical_occupancy: float = 15.0 + bottleneck_excess_occupancy_band: float = 10.0 + bottleneck_window_size: int = 3 + control_temporal_weight: float = 0.7 + control_spatial_weight: float = 0.3 + delta_vsl_max: float = 0.0 + d_threshold: float = 3.0 + v_limit: float = 33.33 + leader_gap_threshold_m: float = 100.0 + + @classmethod + def from_dict( + cls, + raw_cfg: Mapping[str, object], + *, + speed_actions_ms: Sequence[float], + ) -> "RewardConfig": + default_delta_vsl_max = 0.0 + if len(speed_actions_ms) > 0: + default_delta_vsl_max = float(np.max(speed_actions_ms) - np.min(speed_actions_ms)) + + return cls( + reward_scale=float(raw_cfg.get("reward_scale", 10.0)), + outflow_weight=float(raw_cfg.get("outflow_weight", 0.75)), + bottleneck_weight=float(raw_cfg.get("bottleneck_weight", 0.20)), + control_weight_start=float(raw_cfg.get("control_weight_start", 0.05)), + control_weight_end=float(raw_cfg.get("control_weight_end", 0.01)), + control_weight_decay_power=float(raw_cfg.get("control_weight_decay_power", 1.5)), + mainline_discharge_ref_vehph=float(raw_cfg.get("mainline_discharge_ref_vehph", 4924.0)), + bottleneck_critical_occupancy=float(raw_cfg.get("bottleneck_critical_occupancy", 15.0)), + bottleneck_excess_occupancy_band=float( + raw_cfg.get("bottleneck_excess_occupancy_band", 10.0) + ), + bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))), + control_temporal_weight=float(raw_cfg.get("control_temporal_weight", 0.7)), + control_spatial_weight=float(raw_cfg.get("control_spatial_weight", 0.3)), + delta_vsl_max=float(raw_cfg.get("delta_vsl_max", default_delta_vsl_max)), + d_threshold=float(raw_cfg.get("d_threshold", 3.0)), + v_limit=float(raw_cfg.get("v_limit", 33.33)), + leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)), + ) + + +class RewardCalculator: + """Encapsulates a minimal reward for mainline bottleneck VSL control.""" + + def __init__( + self, + *, + config: RewardConfig, + controlled_edge_start_index: int, + total_training_episodes: int, + evaluation_mode: bool = False, + ): + self.config = config + self.controlled_edge_start_index = int(controlled_edge_start_index) + self.total_training_episodes = max(int(total_training_episodes), 1) + self.evaluation_mode = bool(evaluation_mode) + + def get_control_weight(self, episode_index: int) -> float: + if self.evaluation_mode: + return float(self.config.control_weight_end) + + if self.total_training_episodes <= 1: + progress = 1.0 + else: + progress = clip01((float(episode_index) - 1.0) / (self.total_training_episodes - 1.0)) + + decay = (1.0 - progress) ** max(self.config.control_weight_decay_power, 0.0) + return float( + self.config.control_weight_end + + (self.config.control_weight_start - self.config.control_weight_end) * decay + ) + + def calculate( + self, + *, + info: Dict, + current_edge_speeds: np.ndarray, + prev_edge_speeds: np.ndarray, + episode_index: int, + ) -> float: + downstream_mainline_outflow = float(info.get("downstream_mainline_outflow", 0.0)) + r_outflow = clip01( + downstream_mainline_outflow / max(self.config.mainline_discharge_ref_vehph, 1e-6) + ) + + bottleneck_occupancy = float(info.get("bottleneck_occupancy", 0.0)) + excess_occupancy = max( + bottleneck_occupancy - self.config.bottleneck_critical_occupancy, + 0.0, + ) + bottleneck_excess_norm = clip01( + excess_occupancy / max(self.config.bottleneck_excess_occupancy_band, 1e-6) + ) + r_bottleneck = -bottleneck_excess_norm + + active_start = self.controlled_edge_start_index + current_active_speeds = np.asarray(current_edge_speeds[active_start:], dtype=float) + prev_active_speeds = np.asarray(prev_edge_speeds[active_start:], dtype=float) + + if current_active_speeds.size > 0: + temporal_control_change = float( + np.mean(np.abs(current_active_speeds - prev_active_speeds)) + / max(self.config.delta_vsl_max, 1e-6) + ) + else: + temporal_control_change = 0.0 + if current_active_speeds.size >= 2: + spatial_control_change = float( + np.mean(np.abs(np.diff(current_active_speeds))) + / max(self.config.delta_vsl_max, 1e-6) + ) + else: + spatial_control_change = 0.0 + + control_change_norm = clip01( + self.config.control_temporal_weight * clip01(temporal_control_change) + + self.config.control_spatial_weight * clip01(spatial_control_change) + ) + r_ctrl = -control_change_norm + + control_weight = self.get_control_weight(episode_index) + + info["r_outflow"] = float(r_outflow) + info["r_bottleneck"] = float(r_bottleneck) + info["r_ctrl"] = float(r_ctrl) + info["bottleneck_excess_occupancy_norm"] = float(bottleneck_excess_norm) + info["temporal_control_change_norm"] = float(clip01(temporal_control_change)) + info["spatial_control_change_norm"] = float(clip01(spatial_control_change)) + info["control_weight"] = float(control_weight) + + reward = ( + self.config.outflow_weight * r_outflow + + self.config.bottleneck_weight * r_bottleneck + + control_weight * r_ctrl + ) + return float(reward * self.config.reward_scale) diff --git a/scripts/evaluate_models.py b/scripts/evaluate_models.py index 58baf0e..2689da1 100644 --- a/scripts/evaluate_models.py +++ b/scripts/evaluate_models.py @@ -19,6 +19,7 @@ import yaml matplotlib.use("Agg") import matplotlib.pyplot as plt +from matplotlib import colors from agents.appo_agent import APPOAgent from agents.dcmappo_agent import DCMAPPOAgent @@ -36,6 +37,7 @@ from agents.sctd3_agent import SCTD3Agent from agents.tcamappo_agent import TCAMAPPOAgent from agents.td3_agent import TD3Agent from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS from utils.config import get_agent_config from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root @@ -43,6 +45,9 @@ from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root MODEL_ORDER = ["ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix", "ddpg", "sac", "td3", "sctd3"] BASELINE_NAME = "no_control" EVAL_ORDER = [BASELINE_NAME] + MODEL_ORDER +HEATMAP_SPEED_RANGE_KMH = (40.0, 110.0) +HEATMAP_OCCUPANCY_RANGE = (0.0, 35.0) +HEATMAP_ACTION_LEVELS_KMH = [40.0, 60.0, 80.0, 100.0, 110.0] MODEL_LABELS = { BASELINE_NAME: "NO_CONTROL", "ppo": "PPO", @@ -582,6 +587,10 @@ def update_mainline_travel_time_tracking( return len(interval_travel_times), interval_mean, cumulative_mean +def _extract_reward_components(info: dict) -> Dict[str, float]: + return {column: info.get(column, np.nan) for column in REWARD_COMPONENT_COLUMNS} + + def evaluate_single_model( model_name: str, checkpoint_dir: Optional[str], @@ -615,6 +624,7 @@ def evaluate_single_model( runtime_config["runtime"]["collect_detector_cells"] = True runtime_config["runtime"]["use_vehicle_subscriptions"] = True runtime_config["runtime"]["collect_trip_events"] = True + runtime_config["runtime"]["evaluation_mode"] = True env = SUMOEdgeVSLEnvironment(runtime_config) agent = None @@ -656,31 +666,27 @@ def evaluate_single_model( completed_mainline_travel_times, ) - step_rows.append( - { - "model": model_name, - "model_label": MODEL_LABELS[model_name], - "step": step_idx, - "sim_time": info.get("sim_time", np.nan), - "reward": reward, - "throughput": info.get("throughput", np.nan), - "arrived_count": info.get("arrived_count", np.nan), - "departed_count": info.get("departed_count", np.nan), - "mean_speed_kmh": info.get("mean_speed_kmh", np.nan), - "speed_variance_norm": info.get("speed_variance_norm", np.nan), - "mean_occupancy": info.get("mean_occupancy", np.nan), - "density": info.get("density", np.nan), - "num_vehicles": info.get("num_vehicles", np.nan), - "num_hard_brakes": info.get("num_hard_brakes", np.nan), - "r_flow": info.get("r_flow", np.nan), - "r_var": info.get("r_var", np.nan), - "r_brake": info.get("r_brake", np.nan), - "r_penalty": info.get("r_penalty", np.nan), - "mainline_completed_count": mainline_completed_count, - "mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s, - "mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s, - } - ) + step_row = { + "model": model_name, + "model_label": MODEL_LABELS[model_name], + "step": step_idx, + "sim_time": info.get("sim_time", np.nan), + "reward": reward, + "throughput": info.get("throughput", np.nan), + "arrived_count": info.get("arrived_count", np.nan), + "departed_count": info.get("departed_count", np.nan), + "mean_speed_kmh": info.get("mean_speed_kmh", np.nan), + "speed_variance_norm": info.get("speed_variance_norm", np.nan), + "mean_occupancy": info.get("mean_occupancy", np.nan), + "density": info.get("density", np.nan), + "num_vehicles": info.get("num_vehicles", np.nan), + "num_hard_brakes": info.get("num_hard_brakes", np.nan), + "mainline_completed_count": mainline_completed_count, + "mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s, + "mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s, + } + step_row.update(_extract_reward_components(info)) + step_rows.append(step_row) measured_speeds_ms = info.get("edge_speeds_ms", []) occupancies = info.get("edge_occupancies", []) @@ -778,7 +784,7 @@ def evaluate_worker(task: Tuple[str, Optional[str], str, str, int, Optional[int] def build_summary(step_df: pd.DataFrame) -> pd.DataFrame: grouped = step_df.groupby(["model", "model_label"], sort=False) - summary_df = grouped.agg( + aggregations = dict( steps=("step", "count"), reward_sum=("reward", "sum"), reward_mean=("reward", "mean"), @@ -790,13 +796,12 @@ def build_summary(step_df: pd.DataFrame) -> pd.DataFrame: density_mean=("density", "mean"), hard_brakes_total=("num_hard_brakes", "sum"), hard_brakes_mean=("num_hard_brakes", "mean"), - r_flow_mean=("r_flow", "mean"), - r_var_mean=("r_var", "mean"), - r_brake_mean=("r_brake", "mean"), - r_penalty_mean=("r_penalty", "mean"), mainline_completed_total=("mainline_completed_count", "sum"), mainline_travel_time_mean_s=("mainline_travel_time_cumulative_mean_s", "last"), - ).reset_index() + ) + for column in REWARD_COMPONENT_COLUMNS: + aggregations[f"{column}_mean"] = (column, "mean") + summary_df = grouped.agg(**aggregations).reset_index() summary_df["throughput_std"] = summary_df["throughput_std"].fillna(0.0) summary_df["mean_speed_kmh_std"] = summary_df["mean_speed_kmh_std"].fillna(0.0) @@ -860,12 +865,10 @@ def plot_step_comparison(step_df: pd.DataFrame, output_dir: str): def plot_reward_components(step_df: pd.DataFrame, output_dir: str): components = [ - ("r_flow", "R_flow"), - ("r_var", "R_var"), - ("r_brake", "R_brake"), - ("r_penalty", "R_penalty"), + (column, REWARD_COMPONENT_LABELS[column]) + for column in REWARD_COMPONENT_COLUMNS ] - fig, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True) + fig, axes = plt.subplots(4, 2, figsize=(15, 14), sharex=True) axes = axes.flatten() for ax, (column, title) in zip(axes, components): @@ -877,6 +880,8 @@ def plot_reward_components(step_df: pd.DataFrame, output_dir: str): ax.set_title(title) ax.set_xlabel("Step") ax.grid(True, alpha=0.3) + for ax in axes[len(components):]: + ax.axis("off") axes[0].legend() plt.tight_layout() plt.savefig(os.path.join(output_dir, "reward_components.png"), dpi=160) @@ -916,6 +921,15 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output speed_cmap.set_bad(color="#d9d9d9") occ_cmap = plt.get_cmap("magma").copy() occ_cmap.set_bad(color="#d9d9d9") + action_cmap = plt.get_cmap("viridis", len(HEATMAP_ACTION_LEVELS_KMH)).copy() + action_cmap.set_bad(color="#d9d9d9") + action_boundaries = [HEATMAP_ACTION_LEVELS_KMH[0] - 10.0] + action_boundaries.extend( + (left + right) / 2.0 + for left, right in zip(HEATMAP_ACTION_LEVELS_KMH[:-1], HEATMAP_ACTION_LEVELS_KMH[1:]) + ) + action_boundaries.append(HEATMAP_ACTION_LEVELS_KMH[-1] + 10.0) + action_norm = colors.BoundaryNorm(action_boundaries, action_cmap.N, clip=True) for model_name in EVAL_ORDER: detector_model_df = detector_df[detector_df["model"] == model_name] @@ -939,21 +953,6 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output ordered_edge_ids = edge_order["edge_id"].tolist() action_grid = edge_model_df.pivot(index="edge_id", columns="step", values="action_speed_kmh").reindex(ordered_edge_ids).values - speed_valid = speed_grid[np.isfinite(speed_grid)] - occ_valid = occ_grid[np.isfinite(occ_grid)] - action_valid = action_grid[np.isfinite(action_grid)] - - speed_vmin = max(0.0, float(np.nanpercentile(speed_valid, 1))) if speed_valid.size else 0.0 - speed_vmax = float(np.nanpercentile(speed_valid, 99)) if speed_valid.size else 120.0 - if speed_vmax <= speed_vmin: - speed_vmin, speed_vmax = 0.0, 120.0 - - occ_vmax = float(np.nanpercentile(occ_valid, 99)) if occ_valid.size else 100.0 - occ_vmax = max(5.0, min(100.0, occ_vmax)) - - action_vmax = float(np.nanmax(action_valid)) if action_valid.size else 120.0 - action_vmax = max(120.0, action_vmax) - fig, axes = plt.subplots(1, 3, figsize=(21, 7), gridspec_kw={"width_ratios": [1.3, 0.8, 1.3]}) speed_im = axes[0].imshow( @@ -961,39 +960,57 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output aspect="auto", origin="lower", cmap=speed_cmap, - vmin=speed_vmin, - vmax=speed_vmax, + vmin=HEATMAP_SPEED_RANGE_KMH[0], + vmax=HEATMAP_SPEED_RANGE_KMH[1], ) axes[0].set_title(f"{MODEL_LABELS[model_name]} Measured Speed (km/h)") axes[0].set_xlabel("Step") axes[0].set_ylabel("Detector Cell (bottom=upstream, top=downstream)") - plt.colorbar(speed_im, ax=axes[0], fraction=0.046, pad=0.04) + plt.colorbar( + speed_im, + ax=axes[0], + fraction=0.046, + pad=0.04, + ticks=HEATMAP_ACTION_LEVELS_KMH, + ) action_im = axes[1].imshow( np.ma.masked_invalid(action_grid), aspect="auto", origin="lower", - cmap="viridis", - vmin=0, - vmax=action_vmax, + cmap=action_cmap, + norm=action_norm, ) axes[1].set_title(f"{MODEL_LABELS[model_name]} Applied VSL (km/h)") axes[1].set_xlabel("Step") axes[1].set_ylabel("Controlled Edge") - plt.colorbar(action_im, ax=axes[1], fraction=0.046, pad=0.04) + plt.colorbar( + action_im, + ax=axes[1], + fraction=0.046, + pad=0.04, + ticks=HEATMAP_ACTION_LEVELS_KMH, + boundaries=action_boundaries, + ) occ_im = axes[2].imshow( np.ma.masked_invalid(occ_grid), aspect="auto", origin="lower", cmap=occ_cmap, - vmin=0, - vmax=occ_vmax, + vmin=HEATMAP_OCCUPANCY_RANGE[0], + vmax=HEATMAP_OCCUPANCY_RANGE[1], ) axes[2].set_title(f"{MODEL_LABELS[model_name]} Occupancy (%)") axes[2].set_xlabel("Step") axes[2].set_ylabel("Detector Cell (bottom=upstream, top=downstream)") - plt.colorbar(occ_im, ax=axes[2], fraction=0.046, pad=0.04) + plt.colorbar( + occ_im, + ax=axes[2], + fraction=0.046, + pad=0.04, + ticks=np.arange(0.0, 36.0, 5.0), + ) plt.tight_layout() plt.savefig(os.path.join(heatmap_dir, f"{model_name}_heatmaps.png"), dpi=160) diff --git a/scripts/plot_live_training.py b/scripts/plot_live_training.py index a5dcb5c..5b73a8e 100644 --- a/scripts/plot_live_training.py +++ b/scripts/plot_live_training.py @@ -16,6 +16,7 @@ import pandas as pd matplotlib.use("Agg") import matplotlib.pyplot as plt +from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS from utils.run_dirs import find_latest_run_root, find_run_root_by_timestamp @@ -203,7 +204,8 @@ def safe_read_csv(csv_path: str) -> pd.DataFrame: numeric_columns = [ "episode", "reward", "throughput", "mean_speed", "speed_variance_norm", - "r_flow", "r_var", "r_brake", "r_penalty", "hard_brakes", + *REWARD_COMPONENT_COLUMNS, + "hard_brakes", "policy_loss", "value_loss", "entropy", ] for column in numeric_columns: @@ -307,13 +309,22 @@ def plot_detailed_snapshot( ) plot_series(axes[4], df, "hard_brakes", "Hard Brakes", "count", "tab:red", window, show_ma) - reward_components = ["r_flow", "r_var", "r_brake", "r_penalty"] + reward_components = list(REWARD_COMPONENT_COLUMNS) has_components = any( col in df.columns and pd.to_numeric(df[col], errors="coerce").notna().sum() > 0 for col in reward_components ) if has_components: - for col, color in zip(reward_components, ["tab:green", "tab:purple", "tab:red", "tab:brown"]): + component_colors = [ + "tab:green", + "tab:blue", + "tab:orange", + "tab:purple", + "tab:red", + "tab:brown", + "tab:gray", + ] + for col, color in zip(reward_components, component_colors): series = pd.to_numeric(df[col], errors="coerce") if series.notna().sum() > 0: axes[5].plot( @@ -532,6 +543,17 @@ def plot_all_models_overview( if not run_logs: raise ValueError("No readable model logs found for overview plotting.") + available_reward_components = [] + for column in REWARD_COMPONENT_COLUMNS: + if any( + df is not None + and column in df.columns + and pd.to_numeric(df[column], errors="coerce").notna().sum() > 0 + for df in run_logs.values() + ): + available_reward_components.append(column) + available_reward_components = available_reward_components[:4] + fig, axes = plt.subplots(4, 3, figsize=(22, 16)) axes = axes.flatten() metrics = [ @@ -540,14 +562,15 @@ def plot_all_models_overview( ("mean_speed", "Mean Speed", "km/h"), ("speed_variance_norm", "Normalized Speed Variance", "norm"), ("hard_brakes", "Hard Brakes", "count"), - ("r_flow", "R_flow", "value"), - ("r_var", "R_var", "value"), - ("r_brake", "R_brake", "value"), - ("r_penalty", "R_penalty", "value"), ("policy_loss", "Policy Loss", "loss"), ("value_loss", "Value Loss", "loss"), ("entropy", "Entropy", "value"), ] + reward_metrics = [ + (column, REWARD_COMPONENT_LABELS.get(column, column), "value") + for column in available_reward_components + ] + metrics = metrics[:5] + reward_metrics + metrics[5:] for ax, (column, title, ylabel) in zip(axes, metrics): plot_metric_overlay(ax, run_logs, column, title, ylabel, window, show_ma) diff --git a/training/train_appo.py b/training/train_appo.py index dd9b6f6..22ec43c 100644 --- a/training/train_appo.py +++ b/training/train_appo.py @@ -15,6 +15,7 @@ from tqdm import tqdm import torch from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.appo_agent import APPOAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -42,6 +43,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -118,10 +120,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0 episode_speed = 0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0 - episode_r_var = 0 - episode_r_brake = 0 - episode_r_penalty = 0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -142,10 +141,8 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -173,10 +170,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -190,10 +184,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -203,10 +194,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -217,12 +205,10 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) if train_stats: episode_summary.update( policy_loss=float(train_stats["policy_loss"]), @@ -249,7 +235,13 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Throughput: {avg_tp:.1f} veh/h") print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") - print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}") + print( + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) + ) if train_stats: print(f" Policy Loss: {train_stats['policy_loss']:.4f}") print(f" Value Loss: {train_stats['value_loss']:.4f}") diff --git a/training/train_dcmappo.py b/training/train_dcmappo.py index 477a3a7..645d17d 100644 --- a/training/train_dcmappo.py +++ b/training/train_dcmappo.py @@ -11,6 +11,7 @@ matplotlib.use("Agg") from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.dcmappo_agent import DCMAPPOAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -36,6 +37,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -111,10 +113,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -131,10 +130,8 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -154,10 +151,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) @@ -175,10 +169,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -191,10 +182,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -204,12 +192,10 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) if train_stats: episode_summary.update( policy_loss=float(train_stats["policy_loss"]), @@ -236,8 +222,11 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - f" R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if train_stats: print(f" Policy Loss: {train_stats['policy_loss']:.4f}") diff --git a/training/train_ddpg.py b/training/train_ddpg.py index 3f8d3bc..58c9510 100644 --- a/training/train_ddpg.py +++ b/training/train_ddpg.py @@ -12,6 +12,7 @@ from datetime import datetime from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.ddpg_agent import DDPGAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -38,6 +39,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -100,10 +102,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0 episode_speed = 0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0 - episode_r_var = 0 - episode_r_brake = 0 - episode_r_penalty = 0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -121,10 +120,8 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -137,10 +134,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -150,10 +144,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -163,12 +154,10 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) save_training_episode_artifacts( log_dir=log_dir, episode=episode, @@ -188,7 +177,13 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Throughput: {avg_tp:.1f} veh/h") print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") - print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}") + print( + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) + ) if episode % save_freq == 0: agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}")) diff --git a/training/train_gpro.py b/training/train_gpro.py index 182b71e..228aa0e 100644 --- a/training/train_gpro.py +++ b/training/train_gpro.py @@ -9,6 +9,7 @@ matplotlib.use("Agg") from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.gpro_agent import GPROAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -35,6 +36,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -115,10 +117,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -139,10 +138,8 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -161,10 +158,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -178,10 +172,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_tp": avg_tp, "avg_speed": avg_speed, "avg_speed_variance_norm": avg_speed_variance_norm, - "avg_r_flow": avg_r_flow, - "avg_r_var": avg_r_var, - "avg_r_brake": avg_r_brake, - "avg_r_penalty": avg_r_penalty, + "reward_components": dict(avg_reward_components), "episode_brakes": episode_brakes, } ) @@ -192,13 +183,11 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), "group_seed": int(group_seed), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) save_training_episode_artifacts( log_dir=log_dir, episode=episode, @@ -219,8 +208,11 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - " R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if episode % save_freq == 0: @@ -247,10 +239,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): row["avg_tp"], row["avg_speed"], speed_variance_norm=row["avg_speed_variance_norm"], - r_flow=row["avg_r_flow"], - r_var=row["avg_r_var"], - r_brake=row["avg_r_brake"], - r_penalty=row["avg_r_penalty"], + reward_components=row["reward_components"], hard_brakes=row["episode_brakes"], policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -263,10 +252,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None): row["avg_tp"], row["avg_speed"], speed_variance_norm=row["avg_speed_variance_norm"], - r_flow=row["avg_r_flow"], - r_var=row["avg_r_var"], - r_brake=row["avg_r_brake"], - r_penalty=row["avg_r_penalty"], + reward_components=row["reward_components"], hard_brakes=row["episode_brakes"], ) diff --git a/training/train_mappo.py b/training/train_mappo.py index c9a7d35..c026a7c 100644 --- a/training/train_mappo.py +++ b/training/train_mappo.py @@ -12,6 +12,7 @@ from datetime import datetime from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.mappo_agent import MAPPOAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -37,6 +38,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -108,10 +110,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -128,10 +127,8 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -151,10 +148,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) @@ -172,10 +166,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -188,10 +179,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -201,12 +189,10 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) if train_stats: episode_summary.update( policy_loss=float(train_stats["policy_loss"]), @@ -233,8 +219,11 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - f" R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if train_stats: print(f" Policy Loss: {train_stats['policy_loss']:.4f}") diff --git a/training/train_ppo.py b/training/train_ppo.py index c26f869..cc059fc 100644 --- a/training/train_ppo.py +++ b/training/train_ppo.py @@ -15,6 +15,7 @@ from tqdm import tqdm import torch from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.ppo_agent import PPOAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -43,6 +44,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -118,10 +120,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0 episode_speed = 0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0 - episode_r_var = 0 - episode_r_brake = 0 - episode_r_penalty = 0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -142,10 +141,8 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -173,10 +170,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -190,10 +184,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -203,10 +194,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -217,12 +205,10 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) if train_stats: episode_summary.update( policy_loss=float(train_stats["policy_loss"]), @@ -249,7 +235,13 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Throughput: {avg_tp:.1f} veh/h") print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") - print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}") + print( + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) + ) if train_stats: print(f" Policy Loss: {train_stats['policy_loss']:.4f}") print(f" Value Loss: {train_stats['value_loss']:.4f}") diff --git a/training/train_sac.py b/training/train_sac.py index 24aaf6b..87031da 100644 --- a/training/train_sac.py +++ b/training/train_sac.py @@ -9,6 +9,7 @@ matplotlib.use("Agg") from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.sac_agent import SACAgent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -35,6 +36,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -100,10 +102,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -125,10 +124,8 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -145,10 +142,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -161,10 +155,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -174,12 +165,10 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) save_training_episode_artifacts( log_dir=log_dir, episode=episode, @@ -200,8 +189,11 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - " R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if episode % save_freq == 0: diff --git a/training/train_tcamappo.py b/training/train_tcamappo.py index fad9e41..98520d5 100644 --- a/training/train_tcamappo.py +++ b/training/train_tcamappo.py @@ -13,6 +13,7 @@ matplotlib.use("Agg") from agents.tcamappo_agent import TCAMAPPOAgent from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts from utils.logger import TrainingLogger @@ -37,6 +38,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -116,10 +118,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -138,10 +137,8 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -161,10 +158,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) @@ -182,10 +176,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, policy_loss=train_stats["policy_loss"], value_loss=train_stats["value_loss"], @@ -198,10 +189,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -211,12 +199,10 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) if train_stats: episode_summary.update( policy_loss=float(train_stats["policy_loss"]), @@ -243,8 +229,11 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None): print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - f" R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if train_stats: print(f" Policy Loss: {train_stats['policy_loss']:.4f}") diff --git a/training/train_td3.py b/training/train_td3.py index aedf83d..f158d3f 100644 --- a/training/train_td3.py +++ b/training/train_td3.py @@ -13,6 +13,7 @@ from datetime import datetime from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from agents.td3_agent import TD3Agent from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts @@ -47,6 +48,7 @@ def train_sumo_td3( os.makedirs(log_dir, exist_ok=True) runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -123,10 +125,7 @@ def train_sumo_td3( episode_throughput = 0 episode_speed = 0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0 - episode_r_var = 0 - episode_r_brake = 0 - episode_r_penalty = 0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -148,10 +147,8 @@ def train_sumo_td3( episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -168,10 +165,7 @@ def train_sumo_td3( avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) episode_mean_speeds.append(avg_speed) @@ -181,10 +175,7 @@ def train_sumo_td3( logger.log( episode, episode_reward, avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, ) @@ -194,12 +185,10 @@ def train_sumo_td3( "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) save_training_episode_artifacts( log_dir=log_dir, episode=episode, @@ -219,7 +208,13 @@ def train_sumo_td3( print(f" Throughput: {avg_tp:.1f} veh/h") print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") - print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}") + print( + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) + ) if episode % save_freq == 0: agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}")) diff --git a/training/train_value_based.py b/training/train_value_based.py index c1d48fd..8d378f0 100644 --- a/training/train_value_based.py +++ b/training/train_value_based.py @@ -12,6 +12,7 @@ import yaml from tqdm import tqdm from envs.edge_vsl_env import SUMOEdgeVSLEnvironment +from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals from utils.config import get_agent_config, get_training_config from utils.episode_artifacts import save_training_episode_artifacts from utils.logger import TrainingLogger @@ -83,6 +84,7 @@ def train_sumo_value_based( runtime_config = copy.deepcopy(config) runtime_config.setdefault("runtime", {})["output_dir"] = log_dir + runtime_config["runtime"]["evaluation_mode"] = False write_shared_run_config( runtime_config, log_dir=log_dir, @@ -135,10 +137,7 @@ def train_sumo_value_based( episode_throughput = 0.0 episode_speed = 0.0 episode_speed_variance_norm = 0.0 - episode_r_flow = 0.0 - episode_r_var = 0.0 - episode_r_brake = 0.0 - episode_r_penalty = 0.0 + episode_reward_components = init_reward_component_totals() episode_brakes = 0 done = False step = 0 @@ -158,10 +157,8 @@ def train_sumo_value_based( episode_throughput += info["throughput"] episode_speed += info["mean_speed_kmh"] episode_speed_variance_norm += info["speed_variance_norm"] - episode_r_flow += info["r_flow"] - episode_r_var += info["r_var"] - episode_r_brake += info["r_brake"] - episode_r_penalty += info["r_penalty"] + for column in REWARD_COMPONENT_COLUMNS: + episode_reward_components[column] += float(info.get(column, 0.0)) episode_brakes += info["num_hard_brakes"] state = next_state step += 1 @@ -178,10 +175,7 @@ def train_sumo_value_based( avg_tp = episode_throughput / max(step, 1) avg_speed = episode_speed / max(step, 1) avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1) - avg_r_flow = episode_r_flow / max(step, 1) - avg_r_var = episode_r_var / max(step, 1) - avg_r_brake = episode_r_brake / max(step, 1) - avg_r_penalty = episode_r_penalty / max(step, 1) + avg_reward_components = average_reward_components(episode_reward_components, step) episode_rewards.append(episode_reward) episode_throughputs.append(avg_tp) @@ -196,10 +190,7 @@ def train_sumo_value_based( avg_tp, avg_speed, speed_variance_norm=avg_speed_variance_norm, - r_flow=avg_r_flow, - r_var=avg_r_var, - r_brake=avg_r_brake, - r_penalty=avg_r_penalty, + reward_components=avg_reward_components, hard_brakes=episode_brakes, value_loss=loss_val, ) @@ -210,13 +201,11 @@ def train_sumo_value_based( "avg_throughput": float(avg_tp), "avg_mean_speed_kmh": float(avg_speed), "avg_speed_variance_norm": float(avg_speed_variance_norm), - "avg_r_flow": float(avg_r_flow), - "avg_r_var": float(avg_r_var), - "avg_r_brake": float(avg_r_brake), - "avg_r_penalty": float(avg_r_penalty), "hard_brakes": int(episode_brakes), "value_loss": float(loss_val) if loss_val is not None else None, } + for column, value in avg_reward_components.items(): + episode_summary[f"avg_{column}"] = float(value) save_training_episode_artifacts( log_dir=log_dir, episode=episode, @@ -237,8 +226,11 @@ def train_sumo_value_based( print(f" Mean Speed: {avg_speed:.1f} km/h") print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}") print( - f" R(flow/var/brake/pen): " - f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}" + " Reward Components: " + + ", ".join( + f"{column}={avg_reward_components[column]:.3f}" + for column in REWARD_COMPONENT_COLUMNS + ) ) if loss_val is not None: print(f" Value Loss: {loss_val:.4f}") diff --git a/utils/episode_artifacts.py b/utils/episode_artifacts.py index e18766e..e52358c 100644 --- a/utils/episode_artifacts.py +++ b/utils/episode_artifacts.py @@ -10,6 +10,8 @@ import numpy as np matplotlib.use("Agg") import matplotlib.pyplot as plt +from envs.reward_system import REWARD_COMPONENT_COLUMNS + def _safe_float(value, default: float = float("nan")) -> float: try: @@ -25,27 +27,24 @@ def _normalize_step_rows(episode: int, episode_metrics: Sequence[Dict]) -> Tuple for step_idx, info in enumerate(episode_metrics, start=1): step_value = int(info.get("step", step_idx)) - step_rows.append( - { - "episode": episode, - "step": step_value, - "sim_time": _safe_float(info.get("sim_time")), - "reward": _safe_float(info.get("reward")), - "throughput": _safe_float(info.get("throughput")), - "arrived_count": int(info.get("arrived_count", 0)), - "departed_count": int(info.get("departed_count", 0)), - "mean_speed_kmh": _safe_float(info.get("mean_speed_kmh")), - "speed_variance_norm": _safe_float(info.get("speed_variance_norm")), - "mean_occupancy": _safe_float(info.get("mean_occupancy")), - "density": _safe_float(info.get("density")), - "num_vehicles": int(info.get("num_vehicles", 0)), - "num_hard_brakes": int(info.get("num_hard_brakes", 0)), - "r_flow": _safe_float(info.get("r_flow")), - "r_var": _safe_float(info.get("r_var")), - "r_brake": _safe_float(info.get("r_brake")), - "r_penalty": _safe_float(info.get("r_penalty")), - } - ) + step_row = { + "episode": episode, + "step": step_value, + "sim_time": _safe_float(info.get("sim_time")), + "reward": _safe_float(info.get("reward")), + "throughput": _safe_float(info.get("throughput")), + "arrived_count": int(info.get("arrived_count", 0)), + "departed_count": int(info.get("departed_count", 0)), + "mean_speed_kmh": _safe_float(info.get("mean_speed_kmh")), + "speed_variance_norm": _safe_float(info.get("speed_variance_norm")), + "mean_occupancy": _safe_float(info.get("mean_occupancy")), + "density": _safe_float(info.get("density")), + "num_vehicles": int(info.get("num_vehicles", 0)), + "num_hard_brakes": int(info.get("num_hard_brakes", 0)), + } + for column in REWARD_COMPONENT_COLUMNS: + step_row[column] = _safe_float(info.get(column)) + step_rows.append(step_row) action_speeds = list(info.get("edge_speeds_kmh", [])) measured_speeds_ms = list(info.get("edge_speeds_ms", [])) diff --git a/utils/logger.py b/utils/logger.py index 457cc4d..c395b28 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -1,15 +1,26 @@ -"""训练日志CSV记录器""" +"""Training CSV logger.""" + import csv import os +from typing import Mapping, Optional + +from envs.reward_system import REWARD_COMPONENT_COLUMNS class TrainingLogger: def __init__(self, log_dir, model_name, resume=False): self.log_path = os.path.join(log_dir, f"{model_name}_training_log.csv") self.fieldnames = [ - "episode", "reward", "throughput", "mean_speed", "speed_variance_norm", - "r_flow", "r_var", "r_brake", "r_penalty", "hard_brakes", - "policy_loss", "value_loss", "entropy" + "episode", + "reward", + "throughput", + "mean_speed", + "speed_variance_norm", + *REWARD_COMPONENT_COLUMNS, + "hard_brakes", + "policy_loss", + "value_loss", + "entropy", ] if not resume or not os.path.exists(self.log_path): @@ -17,23 +28,38 @@ class TrainingLogger: writer = csv.DictWriter(f, fieldnames=self.fieldnames) writer.writeheader() - def log(self, episode, reward, throughput, mean_speed, speed_variance_norm=None, - r_flow=None, r_var=None, r_brake=None, r_penalty=None, hard_brakes=0, - policy_loss=None, value_loss=None, entropy=None): + def log( + self, + episode, + reward, + throughput, + mean_speed, + *, + speed_variance_norm: Optional[float] = None, + reward_components: Optional[Mapping[str, float]] = None, + hard_brakes=0, + policy_loss: Optional[float] = None, + value_loss: Optional[float] = None, + entropy: Optional[float] = None, + ): + reward_components = dict(reward_components or {}) + row = { + "episode": episode, + "reward": f"{reward:.4f}", + "throughput": f"{throughput:.2f}", + "mean_speed": f"{mean_speed:.2f}", + "speed_variance_norm": ( + f"{speed_variance_norm:.6f}" if speed_variance_norm is not None else "" + ), + "hard_brakes": f"{hard_brakes:.0f}", + "policy_loss": f"{policy_loss:.6f}" if policy_loss is not None else "", + "value_loss": f"{value_loss:.6f}" if value_loss is not None else "", + "entropy": f"{entropy:.6f}" if entropy is not None else "", + } + for column in REWARD_COMPONENT_COLUMNS: + value = reward_components.get(column) + row[column] = f"{value:.6f}" if value is not None else "" + with open(self.log_path, "a", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=self.fieldnames) - writer.writerow({ - "episode": episode, - "reward": f"{reward:.4f}", - "throughput": f"{throughput:.2f}", - "mean_speed": f"{mean_speed:.2f}", - "speed_variance_norm": f"{speed_variance_norm:.6f}" if speed_variance_norm is not None else "", - "r_flow": f"{r_flow:.6f}" if r_flow is not None else "", - "r_var": f"{r_var:.6f}" if r_var is not None else "", - "r_brake": f"{r_brake:.6f}" if r_brake is not None else "", - "r_penalty": f"{r_penalty:.6f}" if r_penalty is not None else "", - "hard_brakes": f"{hard_brakes:.0f}", - "policy_loss": f"{policy_loss:.6f}" if policy_loss is not None else "", - "value_loss": f"{value_loss:.6f}" if value_loss is not None else "", - "entropy": f"{entropy:.6f}" if entropy is not None else "" - }) + writer.writerow(row)