更换奖励函数

This commit is contained in:
Zihan Ye 2026-04-16 09:37:54 +08:00
parent 34e463d252
commit 8332385fde
19 changed files with 672 additions and 487 deletions

1
.gitignore vendored
View File

@ -39,6 +39,7 @@ Thumbs.db
# AI
.claude/
.codex_tmp/
checkpoints_*/
logs_*/

View File

@ -12,6 +12,8 @@ import torch
import torch.nn as nn
import torch.optim as optim
from envs.reward_system import REWARD_COMPONENT_COLUMNS
class SharedActor(nn.Module):
def __init__(self, local_obs_dim: int, num_actions: int, hidden_dim: int = 256):
@ -182,7 +184,8 @@ class TCAMAPPOAgent:
+ self.last_reward_dim
+ self.agent_id_dim
)
self.history_token_dim = state_dim + num_agents + 5
self.reward_feature_dim = 1 + len(REWARD_COMPONENT_COLUMNS)
self.history_token_dim = state_dim + num_agents + self.reward_feature_dim
self.actor = SharedActor(self.local_obs_dim, num_actions, hidden_dim).to(self.device)
self.critic = TemporalCreditCritic(
@ -260,10 +263,7 @@ class TCAMAPPOAgent:
reward_features = np.array(
[
float(reward) / 10.0,
float(info.get("r_flow", 0.0)),
float(info.get("r_var", 0.0)),
float(info.get("r_brake", 0.0)),
float(info.get("r_penalty", 0.0)),
*[float(info.get(column, 0.0)) for column in REWARD_COMPONENT_COLUMNS],
],
dtype=np.float32,
)

View File

@ -52,23 +52,24 @@ environment:
free_flow_speed: 30.56
reward:
# Positive term: throughput
w_flow: 0.4
# Negative term: speed dispersion across edges
w_var: 0.3
# Density-adaptive hard-brake penalty weight
w_brake_base: 0.1
w_brake_max: 0.3
# Penalty for abrupt VSL changes
w_penalty: 0.2
reward_scale: 10.0
outflow_weight: 0.75
bottleneck_weight: 0.20
control_weight_start: 0.05
control_weight_end: 0.01
control_weight_decay_power: 1.5
mainline_discharge_ref_vehph: 4924.0
bottleneck_critical_occupancy: 15.0
bottleneck_excess_occupancy_band: 10.0
bottleneck_window_size: 3
control_temporal_weight: 0.7
control_spatial_weight: 0.3
rho_critical: 44.75
k_sigmoid: 0.2
d_threshold: 5.0
d_max: 20.0
C_max: 4924
v_limit: 30.56
delta_vsl_max: 16.67
delta_vsl_max: 19.44
leader_gap_threshold_m: 100.0
training:

View File

@ -19,6 +19,7 @@ import traci
import traci.constants as tc
from envs.network_parser import SUMONetworkParser
from envs.reward_system import RewardCalculator, RewardConfig
from utils.experiment_corridor import prepare_experiment_corridor_assets
@ -40,6 +41,7 @@ class SUMOEdgeVSLEnvironment:
runtime_cfg = config.get("runtime", {})
self.runtime_output_dir = runtime_cfg.get("output_dir")
self.runtime_metrics_subdir = runtime_cfg.get("metrics_subdir", "sumo_metrics")
self.evaluation_mode = bool(runtime_cfg.get("evaluation_mode", False))
self.runtime_detector_add_file: Optional[str] = None
self.runtime_enex_add_file: Optional[str] = None
self.collect_detector_cells = runtime_cfg.get(
@ -117,21 +119,11 @@ class SUMOEdgeVSLEnvironment:
self.control_edge_set = set(self.physical_control_edges)
self.reward_cfg = env_cfg.get("reward", {})
self.w1 = self.reward_cfg.get("w_flow", 0.4)
self.w2 = self.reward_cfg.get("w_var", 0.3)
self.w_base = self.reward_cfg.get("w_brake_base", 0.1)
self.w_max = self.reward_cfg.get("w_brake_max", 0.5)
self.w4 = self.reward_cfg.get("w_penalty", 0.2)
self.rho_critical = self.reward_cfg.get("rho_critical", 35.0)
self.k_sigmoid = self.reward_cfg.get("k_sigmoid", 0.2)
self.d_th = self.reward_cfg.get("d_threshold", 3.0)
self.d_max = self.reward_cfg.get("d_max", 8.0)
self.C_max = self.reward_cfg.get("C_max", 6000.0)
self.v_limit = self.reward_cfg.get("v_limit", 33.33)
self.delta_vsl_max = self.reward_cfg.get("delta_vsl_max", 60.0 / 3.6)
self.leader_gap_threshold_m = float(
self.reward_cfg.get("leader_gap_threshold_m", 100.0)
self.reward_config = RewardConfig.from_dict(
self.reward_cfg,
speed_actions_ms=self.speed_actions_ms,
)
self.total_training_episodes = int(config.get("training", {}).get("num_episodes", 1))
self.parser = SUMONetworkParser(
detector_add_file=self._detector_add_template,
@ -140,6 +132,12 @@ class SUMOEdgeVSLEnvironment:
self.controlled_length_km = corridor_assets.controlled_length_m / 1000.0
self.default_edge_speeds = self._build_default_segment_speeds()
self.max_segment_speeds = self.default_edge_speeds.copy()
self.reward_calculator = RewardCalculator(
config=self.reward_config,
controlled_edge_start_index=self.controlled_edge_start_index,
total_training_episodes=self.total_training_episodes,
evaluation_mode=self.evaluation_mode,
)
self.action_dims = [self.num_speed_actions] * self.num_controlled_edges
self.features_per_edge = 3
@ -153,6 +151,10 @@ class SUMOEdgeVSLEnvironment:
self._last_reward = 0.0
self.episode_metrics: List[Dict] = []
self._tracked_vehicle_ids: Set[str] = set()
self._mainline_depart_times: Dict[str, float] = {}
self._active_mainline_vehicle_ids: Set[str] = set()
self._completed_mainline_travel_times: List[float] = []
self._interval_mainline_travel_times: List[float] = []
print("SUMO Edge VSL Environment initialized")
print(f" Control segments: {self.num_edges}")
@ -285,6 +287,8 @@ class SUMOEdgeVSLEnvironment:
pass
self._sumo_running = False
self._tracked_vehicle_ids.clear()
self._mainline_depart_times.clear()
self._active_mainline_vehicle_ids.clear()
def reset(self, seed: Optional[int] = None) -> np.ndarray:
self._episode_count += 1
@ -294,6 +298,10 @@ class SUMOEdgeVSLEnvironment:
self._prev_edge_speeds = self.default_edge_speeds.copy()
self._last_reward = 0.0
self._tracked_vehicle_ids.clear()
self._mainline_depart_times = {}
self._active_mainline_vehicle_ids = set()
self._completed_mainline_travel_times = []
self._interval_mainline_travel_times = []
self._start_sumo(seed=seed)
warmup_steps = int(900 / self.control_interval)
@ -324,33 +332,13 @@ class SUMOEdgeVSLEnvironment:
self._interval_departed = 0
self._interval_departed_vehicle_events = []
self._interval_arrived_vehicle_events = []
self._interval_mainline_travel_times = []
for _ in range(self.steps_per_action):
traci.simulationStep()
sim_time = float(traci.simulation.getTime())
self._interval_arrived += traci.simulation.getArrivedNumber()
self._interval_departed += traci.simulation.getDepartedNumber()
if self.collect_trip_events:
for veh_id in traci.simulation.getDepartedIDList():
is_mainline = False
try:
route_edges = traci.vehicle.getRoute(veh_id)
is_mainline = any(edge_id in self.control_edge_set for edge_id in route_edges)
except Exception:
pass
self._interval_departed_vehicle_events.append(
{
"vehicle_id": veh_id,
"sim_time": sim_time,
"is_mainline": bool(is_mainline),
}
)
for veh_id in traci.simulation.getArrivedIDList():
self._interval_arrived_vehicle_events.append(
{
"vehicle_id": veh_id,
"sim_time": sim_time,
}
)
self._update_mainline_trip_tracking(sim_time)
detector_data = self._get_edge_detector_data()
state = self._collect_state(detector_data)
@ -382,6 +370,62 @@ class SUMOEdgeVSLEnvironment:
def close(self):
self._close_sumo()
def _update_mainline_trip_tracking(self, sim_time: float):
departed_ids = []
arrived_ids = []
try:
departed_ids = list(traci.simulation.getDepartedIDList())
except Exception:
departed_ids = []
try:
arrived_ids = list(traci.simulation.getArrivedIDList())
except Exception:
arrived_ids = []
for veh_id in departed_ids:
is_mainline = False
try:
route_edges = traci.vehicle.getRoute(veh_id)
is_mainline = any(edge_id in self.control_edge_set for edge_id in route_edges)
except Exception:
is_mainline = False
if is_mainline:
self._mainline_depart_times[veh_id] = float(sim_time)
self._active_mainline_vehicle_ids.add(veh_id)
if self.collect_trip_events:
self._interval_departed_vehicle_events.append(
{
"vehicle_id": veh_id,
"sim_time": sim_time,
"is_mainline": bool(is_mainline),
}
)
for veh_id in arrived_ids:
if self.collect_trip_events:
self._interval_arrived_vehicle_events.append(
{
"vehicle_id": veh_id,
"sim_time": sim_time,
}
)
if veh_id not in self._active_mainline_vehicle_ids:
continue
depart_time = self._mainline_depart_times.pop(veh_id, None)
self._active_mainline_vehicle_ids.discard(veh_id)
if depart_time is None:
continue
travel_time = float(sim_time) - float(depart_time)
if travel_time < 0:
continue
self._interval_mainline_travel_times.append(travel_time)
self._completed_mainline_travel_times.append(travel_time)
def _decode_action(self, action: np.ndarray) -> np.ndarray:
action_array = np.asarray(action, dtype=np.int64).reshape(-1)
if action_array.size != self.num_controlled_edges:
@ -565,12 +609,15 @@ class SUMOEdgeVSLEnvironment:
controlled_vehicle_count += 1
speed = results.get(tc.VAR_SPEED)
accel = results.get(tc.VAR_ACCELERATION)
if accel is not None and accel < -self.d_th:
if accel is not None and accel < -self.reward_config.d_threshold:
brake_decels.append(abs(accel))
relative_speed = 0.0
if speed is not None:
try:
leader_info = traci.vehicle.getLeader(veh_id, self.leader_gap_threshold_m)
leader_info = traci.vehicle.getLeader(
veh_id,
self.reward_config.leader_gap_threshold_m,
)
except Exception:
leader_info = None
if leader_info:
@ -610,12 +657,15 @@ class SUMOEdgeVSLEnvironment:
accel = None
speed = None
if accel is not None and accel < -self.d_th:
if accel is not None and accel < -self.reward_config.d_threshold:
brake_decels.append(abs(accel))
relative_speed = 0.0
if speed is not None and speed >= 0:
try:
leader_info = traci.vehicle.getLeader(veh_id, self.leader_gap_threshold_m)
leader_info = traci.vehicle.getLeader(
veh_id,
self.reward_config.leader_gap_threshold_m,
)
except Exception:
leader_info = None
if leader_info:
@ -648,9 +698,10 @@ class SUMOEdgeVSLEnvironment:
info["arrived_count"] = self._interval_arrived
info["departed_count"] = self._interval_departed
edge_speeds, edge_occs, _, valid_speeds = detector_data
edge_speeds, edge_occs, edge_counts, valid_speeds = detector_data
info["edge_speeds_ms"] = edge_speeds
info["edge_occupancies"] = edge_occs
info["edge_vehicle_counts"] = edge_counts
info["mean_speed"] = np.mean(valid_speeds) if valid_speeds else 0.0
info["mean_speed_kmh"] = info["mean_speed"] * 3.6
info["mean_occupancy"] = np.mean(edge_occs) if edge_occs else 0.0
@ -671,9 +722,47 @@ class SUMOEdgeVSLEnvironment:
info["relative_speed_samples"] = relative_speed_samples
info["relative_speed_variance"] = float(np.var(relative_speed_samples)) if relative_speed_samples else 0.0
info["speed_variance_norm"] = (
info["relative_speed_variance"] / max(self.v_limit ** 2, 1e-6)
info["relative_speed_variance"] / max(self.reward_config.v_limit ** 2, 1e-6)
)
active_start = self.controlled_edge_start_index
active_occs = np.asarray(edge_occs[active_start:], dtype=float)
active_counts = np.asarray(edge_counts[active_start:], dtype=float)
if active_occs.size > 0:
bottleneck_window = active_occs[-self.reward_config.bottleneck_window_size :]
bottleneck_rel_idx = int(np.argmax(bottleneck_window))
bottleneck_abs_idx = active_start + max(
0,
active_occs.size - self.reward_config.bottleneck_window_size,
) + bottleneck_rel_idx
info["bottleneck_segment_index"] = bottleneck_abs_idx
info["bottleneck_occupancy"] = float(bottleneck_window[bottleneck_rel_idx])
else:
info["bottleneck_segment_index"] = -1
info["bottleneck_occupancy"] = 0.0
info["downstream_mainline_outflow"] = (
float(active_counts[-1]) * (3600.0 / self.control_interval)
if active_counts.size > 0
else 0.0
)
interval_tt_mean = (
float(np.mean(self._interval_mainline_travel_times))
if self._interval_mainline_travel_times
else np.nan
)
cumulative_tt_mean = (
float(np.mean(self._completed_mainline_travel_times))
if self._completed_mainline_travel_times
else np.nan
)
info["mainline_completed_count"] = len(self._interval_mainline_travel_times)
info["mainline_interval_travel_time_mean_s"] = interval_tt_mean
info["mainline_travel_time_cumulative_mean_s"] = cumulative_tt_mean
info["mainline_active_vehicle_count"] = len(self._active_mainline_vehicle_ids)
try:
info["sim_time"] = traci.simulation.getTime()
except Exception:
@ -685,77 +774,12 @@ class SUMOEdgeVSLEnvironment:
self,
detector_data: Tuple[List[float], List[float], List[int], List[float]],
) -> Dict:
info = {}
throughput = self._interval_arrived * (3600.0 / self.control_interval)
info["throughput"] = throughput
info["arrived_count"] = self._interval_arrived
info["departed_count"] = self._interval_departed
edge_speeds, edge_occs, _, valid_speeds = detector_data
info["edge_speeds_ms"] = edge_speeds
info["edge_occupancies"] = edge_occs
info["mean_speed"] = np.mean(valid_speeds) if valid_speeds else 0.0
info["mean_speed_kmh"] = info["mean_speed"] * 3.6
info["mean_occupancy"] = np.mean(edge_occs) if edge_occs else 0.0
controlled_vehicle_count, brake_decels, valid_following_count, relative_speed_samples = (
self._collect_controlled_vehicle_metrics()
)
info["num_vehicles"] = controlled_vehicle_count
info["density"] = (
controlled_vehicle_count / self.controlled_length_km
if self.controlled_length_km > 0
else 0.0
)
info["brake_decels"] = brake_decels
info["num_hard_brakes"] = len(brake_decels)
info["valid_following_count"] = int(valid_following_count)
info["risky_following_count"] = int(sum(value > 0 for value in relative_speed_samples))
info["relative_speed_samples"] = relative_speed_samples
info["relative_speed_variance"] = float(np.var(relative_speed_samples)) if relative_speed_samples else 0.0
info["speed_variance_norm"] = (
info["relative_speed_variance"] / max(self.v_limit ** 2, 1e-6)
)
try:
info["sim_time"] = traci.simulation.getTime()
except Exception:
info["sim_time"] = 0.0
return info
return self._collect_runtime_metrics(detector_data)
def _calculate_reward(self, info: Dict) -> float:
q_t = info["throughput"]
r_flow = q_t / self.C_max
speed_variance_norm = info["speed_variance_norm"]
r_var = -speed_variance_norm
rho_t = info["density"]
w3 = self.w_base + (self.w_max - self.w_base) / (
1 + np.exp(-self.k_sigmoid * (rho_t - self.rho_critical))
return self.reward_calculator.calculate(
info=info,
current_edge_speeds=self.current_edge_speeds,
prev_edge_speeds=self._prev_edge_speeds,
episode_index=self._episode_count,
)
brake_decels = info["brake_decels"]
total_vehicles = max(info["num_vehicles"], 1)
if brake_decels:
sum_brake_penalty = sum(
max(0, (d - self.d_th) / (self.d_max - self.d_th))
for d in brake_decels
)
brake_penalty = sum_brake_penalty / total_vehicles
else:
brake_penalty = 0.0
r_brake = -brake_penalty
vsl_change = np.abs(self.current_edge_speeds - self._prev_edge_speeds)
max_vsl_change = np.max(vsl_change)
r_penalty = -max_vsl_change / self.delta_vsl_max
info["r_flow"] = float(r_flow)
info["r_var"] = float(r_var)
info["r_brake"] = float(r_brake)
info["r_penalty"] = float(r_penalty)
reward = self.w1 * r_flow + self.w2 * r_var + w3 * r_brake + self.w4 * r_penalty
return float(reward * 10.0)

183
envs/reward_system.py Normal file
View File

@ -0,0 +1,183 @@
"""Shared reward configuration and calculation for freeway VSL environments."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Mapping, Sequence
import numpy as np
REWARD_COMPONENT_COLUMNS = (
"r_outflow",
"r_bottleneck",
"r_ctrl",
)
REWARD_COMPONENT_LABELS = {
"r_outflow": "R_outflow",
"r_bottleneck": "R_bottleneck",
"r_ctrl": "R_ctrl",
}
def clip01(value: float) -> float:
return float(np.clip(value, 0.0, 1.0))
def init_reward_component_totals() -> Dict[str, float]:
return {column: 0.0 for column in REWARD_COMPONENT_COLUMNS}
def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[str, float]:
denom = max(int(steps), 1)
return {column: float(totals.get(column, 0.0)) / denom for column in REWARD_COMPONENT_COLUMNS}
@dataclass(frozen=True)
class RewardConfig:
reward_scale: float = 10.0
outflow_weight: float = 0.75
bottleneck_weight: float = 0.20
control_weight_start: float = 0.05
control_weight_end: float = 0.01
control_weight_decay_power: float = 1.5
mainline_discharge_ref_vehph: float = 4924.0
bottleneck_critical_occupancy: float = 15.0
bottleneck_excess_occupancy_band: float = 10.0
bottleneck_window_size: int = 3
control_temporal_weight: float = 0.7
control_spatial_weight: float = 0.3
delta_vsl_max: float = 0.0
d_threshold: float = 3.0
v_limit: float = 33.33
leader_gap_threshold_m: float = 100.0
@classmethod
def from_dict(
cls,
raw_cfg: Mapping[str, object],
*,
speed_actions_ms: Sequence[float],
) -> "RewardConfig":
default_delta_vsl_max = 0.0
if len(speed_actions_ms) > 0:
default_delta_vsl_max = float(np.max(speed_actions_ms) - np.min(speed_actions_ms))
return cls(
reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
outflow_weight=float(raw_cfg.get("outflow_weight", 0.75)),
bottleneck_weight=float(raw_cfg.get("bottleneck_weight", 0.20)),
control_weight_start=float(raw_cfg.get("control_weight_start", 0.05)),
control_weight_end=float(raw_cfg.get("control_weight_end", 0.01)),
control_weight_decay_power=float(raw_cfg.get("control_weight_decay_power", 1.5)),
mainline_discharge_ref_vehph=float(raw_cfg.get("mainline_discharge_ref_vehph", 4924.0)),
bottleneck_critical_occupancy=float(raw_cfg.get("bottleneck_critical_occupancy", 15.0)),
bottleneck_excess_occupancy_band=float(
raw_cfg.get("bottleneck_excess_occupancy_band", 10.0)
),
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
control_temporal_weight=float(raw_cfg.get("control_temporal_weight", 0.7)),
control_spatial_weight=float(raw_cfg.get("control_spatial_weight", 0.3)),
delta_vsl_max=float(raw_cfg.get("delta_vsl_max", default_delta_vsl_max)),
d_threshold=float(raw_cfg.get("d_threshold", 3.0)),
v_limit=float(raw_cfg.get("v_limit", 33.33)),
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
)
class RewardCalculator:
"""Encapsulates a minimal reward for mainline bottleneck VSL control."""
def __init__(
self,
*,
config: RewardConfig,
controlled_edge_start_index: int,
total_training_episodes: int,
evaluation_mode: bool = False,
):
self.config = config
self.controlled_edge_start_index = int(controlled_edge_start_index)
self.total_training_episodes = max(int(total_training_episodes), 1)
self.evaluation_mode = bool(evaluation_mode)
def get_control_weight(self, episode_index: int) -> float:
if self.evaluation_mode:
return float(self.config.control_weight_end)
if self.total_training_episodes <= 1:
progress = 1.0
else:
progress = clip01((float(episode_index) - 1.0) / (self.total_training_episodes - 1.0))
decay = (1.0 - progress) ** max(self.config.control_weight_decay_power, 0.0)
return float(
self.config.control_weight_end
+ (self.config.control_weight_start - self.config.control_weight_end) * decay
)
def calculate(
self,
*,
info: Dict,
current_edge_speeds: np.ndarray,
prev_edge_speeds: np.ndarray,
episode_index: int,
) -> float:
downstream_mainline_outflow = float(info.get("downstream_mainline_outflow", 0.0))
r_outflow = clip01(
downstream_mainline_outflow / max(self.config.mainline_discharge_ref_vehph, 1e-6)
)
bottleneck_occupancy = float(info.get("bottleneck_occupancy", 0.0))
excess_occupancy = max(
bottleneck_occupancy - self.config.bottleneck_critical_occupancy,
0.0,
)
bottleneck_excess_norm = clip01(
excess_occupancy / max(self.config.bottleneck_excess_occupancy_band, 1e-6)
)
r_bottleneck = -bottleneck_excess_norm
active_start = self.controlled_edge_start_index
current_active_speeds = np.asarray(current_edge_speeds[active_start:], dtype=float)
prev_active_speeds = np.asarray(prev_edge_speeds[active_start:], dtype=float)
if current_active_speeds.size > 0:
temporal_control_change = float(
np.mean(np.abs(current_active_speeds - prev_active_speeds))
/ max(self.config.delta_vsl_max, 1e-6)
)
else:
temporal_control_change = 0.0
if current_active_speeds.size >= 2:
spatial_control_change = float(
np.mean(np.abs(np.diff(current_active_speeds)))
/ max(self.config.delta_vsl_max, 1e-6)
)
else:
spatial_control_change = 0.0
control_change_norm = clip01(
self.config.control_temporal_weight * clip01(temporal_control_change)
+ self.config.control_spatial_weight * clip01(spatial_control_change)
)
r_ctrl = -control_change_norm
control_weight = self.get_control_weight(episode_index)
info["r_outflow"] = float(r_outflow)
info["r_bottleneck"] = float(r_bottleneck)
info["r_ctrl"] = float(r_ctrl)
info["bottleneck_excess_occupancy_norm"] = float(bottleneck_excess_norm)
info["temporal_control_change_norm"] = float(clip01(temporal_control_change))
info["spatial_control_change_norm"] = float(clip01(spatial_control_change))
info["control_weight"] = float(control_weight)
reward = (
self.config.outflow_weight * r_outflow
+ self.config.bottleneck_weight * r_bottleneck
+ control_weight * r_ctrl
)
return float(reward * self.config.reward_scale)

View File

@ -19,6 +19,7 @@ import yaml
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib import colors
from agents.appo_agent import APPOAgent
from agents.dcmappo_agent import DCMAPPOAgent
@ -36,6 +37,7 @@ from agents.sctd3_agent import SCTD3Agent
from agents.tcamappo_agent import TCAMAPPOAgent
from agents.td3_agent import TD3Agent
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS
from utils.config import get_agent_config
from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root
@ -43,6 +45,9 @@ from utils.run_dirs import find_shared_config_path, resolve_checkpoint_root
MODEL_ORDER = ["ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix", "ddpg", "sac", "td3", "sctd3"]
BASELINE_NAME = "no_control"
EVAL_ORDER = [BASELINE_NAME] + MODEL_ORDER
HEATMAP_SPEED_RANGE_KMH = (40.0, 110.0)
HEATMAP_OCCUPANCY_RANGE = (0.0, 35.0)
HEATMAP_ACTION_LEVELS_KMH = [40.0, 60.0, 80.0, 100.0, 110.0]
MODEL_LABELS = {
BASELINE_NAME: "NO_CONTROL",
"ppo": "PPO",
@ -582,6 +587,10 @@ def update_mainline_travel_time_tracking(
return len(interval_travel_times), interval_mean, cumulative_mean
def _extract_reward_components(info: dict) -> Dict[str, float]:
return {column: info.get(column, np.nan) for column in REWARD_COMPONENT_COLUMNS}
def evaluate_single_model(
model_name: str,
checkpoint_dir: Optional[str],
@ -615,6 +624,7 @@ def evaluate_single_model(
runtime_config["runtime"]["collect_detector_cells"] = True
runtime_config["runtime"]["use_vehicle_subscriptions"] = True
runtime_config["runtime"]["collect_trip_events"] = True
runtime_config["runtime"]["evaluation_mode"] = True
env = SUMOEdgeVSLEnvironment(runtime_config)
agent = None
@ -656,31 +666,27 @@ def evaluate_single_model(
completed_mainline_travel_times,
)
step_rows.append(
{
"model": model_name,
"model_label": MODEL_LABELS[model_name],
"step": step_idx,
"sim_time": info.get("sim_time", np.nan),
"reward": reward,
"throughput": info.get("throughput", np.nan),
"arrived_count": info.get("arrived_count", np.nan),
"departed_count": info.get("departed_count", np.nan),
"mean_speed_kmh": info.get("mean_speed_kmh", np.nan),
"speed_variance_norm": info.get("speed_variance_norm", np.nan),
"mean_occupancy": info.get("mean_occupancy", np.nan),
"density": info.get("density", np.nan),
"num_vehicles": info.get("num_vehicles", np.nan),
"num_hard_brakes": info.get("num_hard_brakes", np.nan),
"r_flow": info.get("r_flow", np.nan),
"r_var": info.get("r_var", np.nan),
"r_brake": info.get("r_brake", np.nan),
"r_penalty": info.get("r_penalty", np.nan),
"mainline_completed_count": mainline_completed_count,
"mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s,
"mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s,
}
)
step_row = {
"model": model_name,
"model_label": MODEL_LABELS[model_name],
"step": step_idx,
"sim_time": info.get("sim_time", np.nan),
"reward": reward,
"throughput": info.get("throughput", np.nan),
"arrived_count": info.get("arrived_count", np.nan),
"departed_count": info.get("departed_count", np.nan),
"mean_speed_kmh": info.get("mean_speed_kmh", np.nan),
"speed_variance_norm": info.get("speed_variance_norm", np.nan),
"mean_occupancy": info.get("mean_occupancy", np.nan),
"density": info.get("density", np.nan),
"num_vehicles": info.get("num_vehicles", np.nan),
"num_hard_brakes": info.get("num_hard_brakes", np.nan),
"mainline_completed_count": mainline_completed_count,
"mainline_interval_travel_time_mean_s": mainline_interval_travel_time_mean_s,
"mainline_travel_time_cumulative_mean_s": mainline_travel_time_cumulative_mean_s,
}
step_row.update(_extract_reward_components(info))
step_rows.append(step_row)
measured_speeds_ms = info.get("edge_speeds_ms", [])
occupancies = info.get("edge_occupancies", [])
@ -778,7 +784,7 @@ def evaluate_worker(task: Tuple[str, Optional[str], str, str, int, Optional[int]
def build_summary(step_df: pd.DataFrame) -> pd.DataFrame:
grouped = step_df.groupby(["model", "model_label"], sort=False)
summary_df = grouped.agg(
aggregations = dict(
steps=("step", "count"),
reward_sum=("reward", "sum"),
reward_mean=("reward", "mean"),
@ -790,13 +796,12 @@ def build_summary(step_df: pd.DataFrame) -> pd.DataFrame:
density_mean=("density", "mean"),
hard_brakes_total=("num_hard_brakes", "sum"),
hard_brakes_mean=("num_hard_brakes", "mean"),
r_flow_mean=("r_flow", "mean"),
r_var_mean=("r_var", "mean"),
r_brake_mean=("r_brake", "mean"),
r_penalty_mean=("r_penalty", "mean"),
mainline_completed_total=("mainline_completed_count", "sum"),
mainline_travel_time_mean_s=("mainline_travel_time_cumulative_mean_s", "last"),
).reset_index()
)
for column in REWARD_COMPONENT_COLUMNS:
aggregations[f"{column}_mean"] = (column, "mean")
summary_df = grouped.agg(**aggregations).reset_index()
summary_df["throughput_std"] = summary_df["throughput_std"].fillna(0.0)
summary_df["mean_speed_kmh_std"] = summary_df["mean_speed_kmh_std"].fillna(0.0)
@ -860,12 +865,10 @@ def plot_step_comparison(step_df: pd.DataFrame, output_dir: str):
def plot_reward_components(step_df: pd.DataFrame, output_dir: str):
components = [
("r_flow", "R_flow"),
("r_var", "R_var"),
("r_brake", "R_brake"),
("r_penalty", "R_penalty"),
(column, REWARD_COMPONENT_LABELS[column])
for column in REWARD_COMPONENT_COLUMNS
]
fig, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True)
fig, axes = plt.subplots(4, 2, figsize=(15, 14), sharex=True)
axes = axes.flatten()
for ax, (column, title) in zip(axes, components):
@ -877,6 +880,8 @@ def plot_reward_components(step_df: pd.DataFrame, output_dir: str):
ax.set_title(title)
ax.set_xlabel("Step")
ax.grid(True, alpha=0.3)
for ax in axes[len(components):]:
ax.axis("off")
axes[0].legend()
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "reward_components.png"), dpi=160)
@ -916,6 +921,15 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output
speed_cmap.set_bad(color="#d9d9d9")
occ_cmap = plt.get_cmap("magma").copy()
occ_cmap.set_bad(color="#d9d9d9")
action_cmap = plt.get_cmap("viridis", len(HEATMAP_ACTION_LEVELS_KMH)).copy()
action_cmap.set_bad(color="#d9d9d9")
action_boundaries = [HEATMAP_ACTION_LEVELS_KMH[0] - 10.0]
action_boundaries.extend(
(left + right) / 2.0
for left, right in zip(HEATMAP_ACTION_LEVELS_KMH[:-1], HEATMAP_ACTION_LEVELS_KMH[1:])
)
action_boundaries.append(HEATMAP_ACTION_LEVELS_KMH[-1] + 10.0)
action_norm = colors.BoundaryNorm(action_boundaries, action_cmap.N, clip=True)
for model_name in EVAL_ORDER:
detector_model_df = detector_df[detector_df["model"] == model_name]
@ -939,21 +953,6 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output
ordered_edge_ids = edge_order["edge_id"].tolist()
action_grid = edge_model_df.pivot(index="edge_id", columns="step", values="action_speed_kmh").reindex(ordered_edge_ids).values
speed_valid = speed_grid[np.isfinite(speed_grid)]
occ_valid = occ_grid[np.isfinite(occ_grid)]
action_valid = action_grid[np.isfinite(action_grid)]
speed_vmin = max(0.0, float(np.nanpercentile(speed_valid, 1))) if speed_valid.size else 0.0
speed_vmax = float(np.nanpercentile(speed_valid, 99)) if speed_valid.size else 120.0
if speed_vmax <= speed_vmin:
speed_vmin, speed_vmax = 0.0, 120.0
occ_vmax = float(np.nanpercentile(occ_valid, 99)) if occ_valid.size else 100.0
occ_vmax = max(5.0, min(100.0, occ_vmax))
action_vmax = float(np.nanmax(action_valid)) if action_valid.size else 120.0
action_vmax = max(120.0, action_vmax)
fig, axes = plt.subplots(1, 3, figsize=(21, 7), gridspec_kw={"width_ratios": [1.3, 0.8, 1.3]})
speed_im = axes[0].imshow(
@ -961,39 +960,57 @@ def plot_model_heatmaps(edge_df: pd.DataFrame, detector_df: pd.DataFrame, output
aspect="auto",
origin="lower",
cmap=speed_cmap,
vmin=speed_vmin,
vmax=speed_vmax,
vmin=HEATMAP_SPEED_RANGE_KMH[0],
vmax=HEATMAP_SPEED_RANGE_KMH[1],
)
axes[0].set_title(f"{MODEL_LABELS[model_name]} Measured Speed (km/h)")
axes[0].set_xlabel("Step")
axes[0].set_ylabel("Detector Cell (bottom=upstream, top=downstream)")
plt.colorbar(speed_im, ax=axes[0], fraction=0.046, pad=0.04)
plt.colorbar(
speed_im,
ax=axes[0],
fraction=0.046,
pad=0.04,
ticks=HEATMAP_ACTION_LEVELS_KMH,
)
action_im = axes[1].imshow(
np.ma.masked_invalid(action_grid),
aspect="auto",
origin="lower",
cmap="viridis",
vmin=0,
vmax=action_vmax,
cmap=action_cmap,
norm=action_norm,
)
axes[1].set_title(f"{MODEL_LABELS[model_name]} Applied VSL (km/h)")
axes[1].set_xlabel("Step")
axes[1].set_ylabel("Controlled Edge")
plt.colorbar(action_im, ax=axes[1], fraction=0.046, pad=0.04)
plt.colorbar(
action_im,
ax=axes[1],
fraction=0.046,
pad=0.04,
ticks=HEATMAP_ACTION_LEVELS_KMH,
boundaries=action_boundaries,
)
occ_im = axes[2].imshow(
np.ma.masked_invalid(occ_grid),
aspect="auto",
origin="lower",
cmap=occ_cmap,
vmin=0,
vmax=occ_vmax,
vmin=HEATMAP_OCCUPANCY_RANGE[0],
vmax=HEATMAP_OCCUPANCY_RANGE[1],
)
axes[2].set_title(f"{MODEL_LABELS[model_name]} Occupancy (%)")
axes[2].set_xlabel("Step")
axes[2].set_ylabel("Detector Cell (bottom=upstream, top=downstream)")
plt.colorbar(occ_im, ax=axes[2], fraction=0.046, pad=0.04)
plt.colorbar(
occ_im,
ax=axes[2],
fraction=0.046,
pad=0.04,
ticks=np.arange(0.0, 36.0, 5.0),
)
plt.tight_layout()
plt.savefig(os.path.join(heatmap_dir, f"{model_name}_heatmaps.png"), dpi=160)

View File

@ -16,6 +16,7 @@ import pandas as pd
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from envs.reward_system import REWARD_COMPONENT_COLUMNS, REWARD_COMPONENT_LABELS
from utils.run_dirs import find_latest_run_root, find_run_root_by_timestamp
@ -203,7 +204,8 @@ def safe_read_csv(csv_path: str) -> pd.DataFrame:
numeric_columns = [
"episode", "reward", "throughput", "mean_speed", "speed_variance_norm",
"r_flow", "r_var", "r_brake", "r_penalty", "hard_brakes",
*REWARD_COMPONENT_COLUMNS,
"hard_brakes",
"policy_loss", "value_loss", "entropy",
]
for column in numeric_columns:
@ -307,13 +309,22 @@ def plot_detailed_snapshot(
)
plot_series(axes[4], df, "hard_brakes", "Hard Brakes", "count", "tab:red", window, show_ma)
reward_components = ["r_flow", "r_var", "r_brake", "r_penalty"]
reward_components = list(REWARD_COMPONENT_COLUMNS)
has_components = any(
col in df.columns and pd.to_numeric(df[col], errors="coerce").notna().sum() > 0
for col in reward_components
)
if has_components:
for col, color in zip(reward_components, ["tab:green", "tab:purple", "tab:red", "tab:brown"]):
component_colors = [
"tab:green",
"tab:blue",
"tab:orange",
"tab:purple",
"tab:red",
"tab:brown",
"tab:gray",
]
for col, color in zip(reward_components, component_colors):
series = pd.to_numeric(df[col], errors="coerce")
if series.notna().sum() > 0:
axes[5].plot(
@ -532,6 +543,17 @@ def plot_all_models_overview(
if not run_logs:
raise ValueError("No readable model logs found for overview plotting.")
available_reward_components = []
for column in REWARD_COMPONENT_COLUMNS:
if any(
df is not None
and column in df.columns
and pd.to_numeric(df[column], errors="coerce").notna().sum() > 0
for df in run_logs.values()
):
available_reward_components.append(column)
available_reward_components = available_reward_components[:4]
fig, axes = plt.subplots(4, 3, figsize=(22, 16))
axes = axes.flatten()
metrics = [
@ -540,14 +562,15 @@ def plot_all_models_overview(
("mean_speed", "Mean Speed", "km/h"),
("speed_variance_norm", "Normalized Speed Variance", "norm"),
("hard_brakes", "Hard Brakes", "count"),
("r_flow", "R_flow", "value"),
("r_var", "R_var", "value"),
("r_brake", "R_brake", "value"),
("r_penalty", "R_penalty", "value"),
("policy_loss", "Policy Loss", "loss"),
("value_loss", "Value Loss", "loss"),
("entropy", "Entropy", "value"),
]
reward_metrics = [
(column, REWARD_COMPONENT_LABELS.get(column, column), "value")
for column in available_reward_components
]
metrics = metrics[:5] + reward_metrics + metrics[5:]
for ax, (column, title, ylabel) in zip(axes, metrics):
plot_metric_overlay(ax, run_logs, column, title, ylabel, window, show_ma)

View File

@ -15,6 +15,7 @@ from tqdm import tqdm
import torch
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.appo_agent import APPOAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -42,6 +43,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -118,10 +120,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0
episode_speed = 0
episode_speed_variance_norm = 0.0
episode_r_flow = 0
episode_r_var = 0
episode_r_brake = 0
episode_r_penalty = 0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -142,10 +141,8 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -173,10 +170,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -190,10 +184,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -203,10 +194,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -217,12 +205,10 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
if train_stats:
episode_summary.update(
policy_loss=float(train_stats["policy_loss"]),
@ -249,7 +235,13 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Throughput: {avg_tp:.1f} veh/h")
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}")
print(
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if train_stats:
print(f" Policy Loss: {train_stats['policy_loss']:.4f}")
print(f" Value Loss: {train_stats['value_loss']:.4f}")

View File

@ -11,6 +11,7 @@ matplotlib.use("Agg")
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.dcmappo_agent import DCMAPPOAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -36,6 +37,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -111,10 +113,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -131,10 +130,8 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -154,10 +151,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
@ -175,10 +169,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -191,10 +182,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -204,12 +192,10 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
if train_stats:
episode_summary.update(
policy_loss=float(train_stats["policy_loss"]),
@ -236,8 +222,11 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
f" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if train_stats:
print(f" Policy Loss: {train_stats['policy_loss']:.4f}")

View File

@ -12,6 +12,7 @@ from datetime import datetime
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.ddpg_agent import DDPGAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -38,6 +39,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -100,10 +102,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0
episode_speed = 0
episode_speed_variance_norm = 0.0
episode_r_flow = 0
episode_r_var = 0
episode_r_brake = 0
episode_r_penalty = 0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -121,10 +120,8 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -137,10 +134,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -150,10 +144,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -163,12 +154,10 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
@ -188,7 +177,13 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Throughput: {avg_tp:.1f} veh/h")
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}")
print(
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if episode % save_freq == 0:
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}"))

View File

@ -9,6 +9,7 @@ matplotlib.use("Agg")
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.gpro_agent import GPROAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -35,6 +36,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -115,10 +117,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -139,10 +138,8 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -161,10 +158,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -178,10 +172,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_tp": avg_tp,
"avg_speed": avg_speed,
"avg_speed_variance_norm": avg_speed_variance_norm,
"avg_r_flow": avg_r_flow,
"avg_r_var": avg_r_var,
"avg_r_brake": avg_r_brake,
"avg_r_penalty": avg_r_penalty,
"reward_components": dict(avg_reward_components),
"episode_brakes": episode_brakes,
}
)
@ -192,13 +183,11 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
"group_seed": int(group_seed),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
@ -219,8 +208,11 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if episode % save_freq == 0:
@ -247,10 +239,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
row["avg_tp"],
row["avg_speed"],
speed_variance_norm=row["avg_speed_variance_norm"],
r_flow=row["avg_r_flow"],
r_var=row["avg_r_var"],
r_brake=row["avg_r_brake"],
r_penalty=row["avg_r_penalty"],
reward_components=row["reward_components"],
hard_brakes=row["episode_brakes"],
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -263,10 +252,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
row["avg_tp"],
row["avg_speed"],
speed_variance_norm=row["avg_speed_variance_norm"],
r_flow=row["avg_r_flow"],
r_var=row["avg_r_var"],
r_brake=row["avg_r_brake"],
r_penalty=row["avg_r_penalty"],
reward_components=row["reward_components"],
hard_brakes=row["episode_brakes"],
)

View File

@ -12,6 +12,7 @@ from datetime import datetime
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.mappo_agent import MAPPOAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -37,6 +38,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -108,10 +110,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -128,10 +127,8 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -151,10 +148,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
@ -172,10 +166,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -188,10 +179,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -201,12 +189,10 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
if train_stats:
episode_summary.update(
policy_loss=float(train_stats["policy_loss"]),
@ -233,8 +219,11 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
f" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if train_stats:
print(f" Policy Loss: {train_stats['policy_loss']:.4f}")

View File

@ -15,6 +15,7 @@ from tqdm import tqdm
import torch
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.ppo_agent import PPOAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -43,6 +44,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -118,10 +120,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0
episode_speed = 0
episode_speed_variance_norm = 0.0
episode_r_flow = 0
episode_r_var = 0
episode_r_brake = 0
episode_r_penalty = 0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -142,10 +141,8 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -173,10 +170,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -190,10 +184,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -203,10 +194,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -217,12 +205,10 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
if train_stats:
episode_summary.update(
policy_loss=float(train_stats["policy_loss"]),
@ -249,7 +235,13 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Throughput: {avg_tp:.1f} veh/h")
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}")
print(
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if train_stats:
print(f" Policy Loss: {train_stats['policy_loss']:.4f}")
print(f" Value Loss: {train_stats['value_loss']:.4f}")

View File

@ -9,6 +9,7 @@ matplotlib.use("Agg")
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.sac_agent import SACAgent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -35,6 +36,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -100,10 +102,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -125,10 +124,8 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -145,10 +142,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -161,10 +155,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -174,12 +165,10 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
@ -200,8 +189,11 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if episode % save_freq == 0:

View File

@ -13,6 +13,7 @@ matplotlib.use("Agg")
from agents.tcamappo_agent import TCAMAPPOAgent
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
from utils.logger import TrainingLogger
@ -37,6 +38,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -116,10 +118,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -138,10 +137,8 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -161,10 +158,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
@ -182,10 +176,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
policy_loss=train_stats["policy_loss"],
value_loss=train_stats["value_loss"],
@ -198,10 +189,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -211,12 +199,10 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
if train_stats:
episode_summary.update(
policy_loss=float(train_stats["policy_loss"]),
@ -243,8 +229,11 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
f" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if train_stats:
print(f" Policy Loss: {train_stats['policy_loss']:.4f}")

View File

@ -13,6 +13,7 @@ from datetime import datetime
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from agents.td3_agent import TD3Agent
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
@ -47,6 +48,7 @@ def train_sumo_td3(
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -123,10 +125,7 @@ def train_sumo_td3(
episode_throughput = 0
episode_speed = 0
episode_speed_variance_norm = 0.0
episode_r_flow = 0
episode_r_var = 0
episode_r_brake = 0
episode_r_penalty = 0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -148,10 +147,8 @@ def train_sumo_td3(
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -168,10 +165,7 @@ def train_sumo_td3(
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
@ -181,10 +175,7 @@ def train_sumo_td3(
logger.log(
episode, episode_reward, avg_tp, avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
)
@ -194,12 +185,10 @@ def train_sumo_td3(
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
@ -219,7 +208,13 @@ def train_sumo_td3(
print(f" Throughput: {avg_tp:.1f} veh/h")
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(f" R(flow/var/brake/pen): {avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}")
print(
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if episode % save_freq == 0:
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}"))

View File

@ -12,6 +12,7 @@ import yaml
from tqdm import tqdm
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from utils.config import get_agent_config, get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
from utils.logger import TrainingLogger
@ -83,6 +84,7 @@ def train_sumo_value_based(
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -135,10 +137,7 @@ def train_sumo_value_based(
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_r_flow = 0.0
episode_r_var = 0.0
episode_r_brake = 0.0
episode_r_penalty = 0.0
episode_reward_components = init_reward_component_totals()
episode_brakes = 0
done = False
step = 0
@ -158,10 +157,8 @@ def train_sumo_value_based(
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_r_flow += info["r_flow"]
episode_r_var += info["r_var"]
episode_r_brake += info["r_brake"]
episode_r_penalty += info["r_penalty"]
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
episode_brakes += info["num_hard_brakes"]
state = next_state
step += 1
@ -178,10 +175,7 @@ def train_sumo_value_based(
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_r_flow = episode_r_flow / max(step, 1)
avg_r_var = episode_r_var / max(step, 1)
avg_r_brake = episode_r_brake / max(step, 1)
avg_r_penalty = episode_r_penalty / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
@ -196,10 +190,7 @@ def train_sumo_value_based(
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
r_flow=avg_r_flow,
r_var=avg_r_var,
r_brake=avg_r_brake,
r_penalty=avg_r_penalty,
reward_components=avg_reward_components,
hard_brakes=episode_brakes,
value_loss=loss_val,
)
@ -210,13 +201,11 @@ def train_sumo_value_based(
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"avg_r_flow": float(avg_r_flow),
"avg_r_var": float(avg_r_var),
"avg_r_brake": float(avg_r_brake),
"avg_r_penalty": float(avg_r_penalty),
"hard_brakes": int(episode_brakes),
"value_loss": float(loss_val) if loss_val is not None else None,
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
@ -237,8 +226,11 @@ def train_sumo_value_based(
print(f" Mean Speed: {avg_speed:.1f} km/h")
print(f" Normalized Speed Variance: {avg_speed_variance_norm:.4f}")
print(
f" R(flow/var/brake/pen): "
f"{avg_r_flow:.3f} / {avg_r_var:.3f} / {avg_r_brake:.3f} / {avg_r_penalty:.3f}"
" Reward Components: "
+ ", ".join(
f"{column}={avg_reward_components[column]:.3f}"
for column in REWARD_COMPONENT_COLUMNS
)
)
if loss_val is not None:
print(f" Value Loss: {loss_val:.4f}")

View File

@ -10,6 +10,8 @@ import numpy as np
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from envs.reward_system import REWARD_COMPONENT_COLUMNS
def _safe_float(value, default: float = float("nan")) -> float:
try:
@ -25,27 +27,24 @@ def _normalize_step_rows(episode: int, episode_metrics: Sequence[Dict]) -> Tuple
for step_idx, info in enumerate(episode_metrics, start=1):
step_value = int(info.get("step", step_idx))
step_rows.append(
{
"episode": episode,
"step": step_value,
"sim_time": _safe_float(info.get("sim_time")),
"reward": _safe_float(info.get("reward")),
"throughput": _safe_float(info.get("throughput")),
"arrived_count": int(info.get("arrived_count", 0)),
"departed_count": int(info.get("departed_count", 0)),
"mean_speed_kmh": _safe_float(info.get("mean_speed_kmh")),
"speed_variance_norm": _safe_float(info.get("speed_variance_norm")),
"mean_occupancy": _safe_float(info.get("mean_occupancy")),
"density": _safe_float(info.get("density")),
"num_vehicles": int(info.get("num_vehicles", 0)),
"num_hard_brakes": int(info.get("num_hard_brakes", 0)),
"r_flow": _safe_float(info.get("r_flow")),
"r_var": _safe_float(info.get("r_var")),
"r_brake": _safe_float(info.get("r_brake")),
"r_penalty": _safe_float(info.get("r_penalty")),
}
)
step_row = {
"episode": episode,
"step": step_value,
"sim_time": _safe_float(info.get("sim_time")),
"reward": _safe_float(info.get("reward")),
"throughput": _safe_float(info.get("throughput")),
"arrived_count": int(info.get("arrived_count", 0)),
"departed_count": int(info.get("departed_count", 0)),
"mean_speed_kmh": _safe_float(info.get("mean_speed_kmh")),
"speed_variance_norm": _safe_float(info.get("speed_variance_norm")),
"mean_occupancy": _safe_float(info.get("mean_occupancy")),
"density": _safe_float(info.get("density")),
"num_vehicles": int(info.get("num_vehicles", 0)),
"num_hard_brakes": int(info.get("num_hard_brakes", 0)),
}
for column in REWARD_COMPONENT_COLUMNS:
step_row[column] = _safe_float(info.get(column))
step_rows.append(step_row)
action_speeds = list(info.get("edge_speeds_kmh", []))
measured_speeds_ms = list(info.get("edge_speeds_ms", []))

View File

@ -1,15 +1,26 @@
"""训练日志CSV记录器"""
"""Training CSV logger."""
import csv
import os
from typing import Mapping, Optional
from envs.reward_system import REWARD_COMPONENT_COLUMNS
class TrainingLogger:
def __init__(self, log_dir, model_name, resume=False):
self.log_path = os.path.join(log_dir, f"{model_name}_training_log.csv")
self.fieldnames = [
"episode", "reward", "throughput", "mean_speed", "speed_variance_norm",
"r_flow", "r_var", "r_brake", "r_penalty", "hard_brakes",
"policy_loss", "value_loss", "entropy"
"episode",
"reward",
"throughput",
"mean_speed",
"speed_variance_norm",
*REWARD_COMPONENT_COLUMNS,
"hard_brakes",
"policy_loss",
"value_loss",
"entropy",
]
if not resume or not os.path.exists(self.log_path):
@ -17,23 +28,38 @@ class TrainingLogger:
writer = csv.DictWriter(f, fieldnames=self.fieldnames)
writer.writeheader()
def log(self, episode, reward, throughput, mean_speed, speed_variance_norm=None,
r_flow=None, r_var=None, r_brake=None, r_penalty=None, hard_brakes=0,
policy_loss=None, value_loss=None, entropy=None):
def log(
self,
episode,
reward,
throughput,
mean_speed,
*,
speed_variance_norm: Optional[float] = None,
reward_components: Optional[Mapping[str, float]] = None,
hard_brakes=0,
policy_loss: Optional[float] = None,
value_loss: Optional[float] = None,
entropy: Optional[float] = None,
):
reward_components = dict(reward_components or {})
row = {
"episode": episode,
"reward": f"{reward:.4f}",
"throughput": f"{throughput:.2f}",
"mean_speed": f"{mean_speed:.2f}",
"speed_variance_norm": (
f"{speed_variance_norm:.6f}" if speed_variance_norm is not None else ""
),
"hard_brakes": f"{hard_brakes:.0f}",
"policy_loss": f"{policy_loss:.6f}" if policy_loss is not None else "",
"value_loss": f"{value_loss:.6f}" if value_loss is not None else "",
"entropy": f"{entropy:.6f}" if entropy is not None else "",
}
for column in REWARD_COMPONENT_COLUMNS:
value = reward_components.get(column)
row[column] = f"{value:.6f}" if value is not None else ""
with open(self.log_path, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=self.fieldnames)
writer.writerow({
"episode": episode,
"reward": f"{reward:.4f}",
"throughput": f"{throughput:.2f}",
"mean_speed": f"{mean_speed:.2f}",
"speed_variance_norm": f"{speed_variance_norm:.6f}" if speed_variance_norm is not None else "",
"r_flow": f"{r_flow:.6f}" if r_flow is not None else "",
"r_var": f"{r_var:.6f}" if r_var is not None else "",
"r_brake": f"{r_brake:.6f}" if r_brake is not None else "",
"r_penalty": f"{r_penalty:.6f}" if r_penalty is not None else "",
"hard_brakes": f"{hard_brakes:.0f}",
"policy_loss": f"{policy_loss:.6f}" if policy_loss is not None else "",
"value_loss": f"{value_loss:.6f}" if value_loss is not None else "",
"entropy": f"{entropy:.6f}" if entropy is not None else ""
})
writer.writerow(row)