"""Shared reward configuration and calculation for freeway VSL environments.""" from __future__ import annotations from dataclasses import dataclass from typing import Dict, Mapping, Sequence import numpy as np REWARD_COMPONENT_COLUMNS = ( "r_outflow", "r_bottleneck", "r_ctrl", ) REWARD_COMPONENT_LABELS = { "r_outflow": "R_outflow", "r_bottleneck": "R_bottleneck", "r_ctrl": "R_ctrl", } def clip01(value: float) -> float: return float(np.clip(value, 0.0, 1.0)) def init_reward_component_totals() -> Dict[str, float]: return {column: 0.0 for column in REWARD_COMPONENT_COLUMNS} def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[str, float]: denom = max(int(steps), 1) return {column: float(totals.get(column, 0.0)) / denom for column in REWARD_COMPONENT_COLUMNS} @dataclass(frozen=True) class RewardConfig: reward_scale: float = 10.0 outflow_weight: float = 0.75 bottleneck_weight: float = 0.20 control_weight_start: float = 0.05 control_weight_end: float = 0.01 control_weight_decay_power: float = 1.5 mainline_discharge_ref_vehph: float = 4924.0 bottleneck_critical_occupancy: float = 15.0 bottleneck_excess_occupancy_band: float = 10.0 bottleneck_window_size: int = 3 control_temporal_weight: float = 0.7 control_spatial_weight: float = 0.3 delta_vsl_max: float = 0.0 d_threshold: float = 3.0 v_limit: float = 33.33 leader_gap_threshold_m: float = 100.0 @classmethod def from_dict( cls, raw_cfg: Mapping[str, object], *, speed_actions_ms: Sequence[float], ) -> "RewardConfig": default_delta_vsl_max = 0.0 if len(speed_actions_ms) > 0: default_delta_vsl_max = float(np.max(speed_actions_ms) - np.min(speed_actions_ms)) return cls( reward_scale=float(raw_cfg.get("reward_scale", 10.0)), outflow_weight=float(raw_cfg.get("outflow_weight", 0.75)), bottleneck_weight=float(raw_cfg.get("bottleneck_weight", 0.20)), control_weight_start=float(raw_cfg.get("control_weight_start", 0.05)), control_weight_end=float(raw_cfg.get("control_weight_end", 0.01)), control_weight_decay_power=float(raw_cfg.get("control_weight_decay_power", 1.5)), mainline_discharge_ref_vehph=float(raw_cfg.get("mainline_discharge_ref_vehph", 4924.0)), bottleneck_critical_occupancy=float(raw_cfg.get("bottleneck_critical_occupancy", 15.0)), bottleneck_excess_occupancy_band=float( raw_cfg.get("bottleneck_excess_occupancy_band", 10.0) ), bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))), control_temporal_weight=float(raw_cfg.get("control_temporal_weight", 0.7)), control_spatial_weight=float(raw_cfg.get("control_spatial_weight", 0.3)), delta_vsl_max=float(raw_cfg.get("delta_vsl_max", default_delta_vsl_max)), d_threshold=float(raw_cfg.get("d_threshold", 3.0)), v_limit=float(raw_cfg.get("v_limit", 33.33)), leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)), ) class RewardCalculator: """Encapsulates a minimal reward for mainline bottleneck VSL control.""" def __init__( self, *, config: RewardConfig, controlled_edge_start_index: int, total_training_episodes: int, evaluation_mode: bool = False, ): self.config = config self.controlled_edge_start_index = int(controlled_edge_start_index) self.total_training_episodes = max(int(total_training_episodes), 1) self.evaluation_mode = bool(evaluation_mode) def get_control_weight(self, episode_index: int) -> float: if self.evaluation_mode: return float(self.config.control_weight_end) if self.total_training_episodes <= 1: progress = 1.0 else: progress = clip01((float(episode_index) - 1.0) / (self.total_training_episodes - 1.0)) decay = (1.0 - progress) ** max(self.config.control_weight_decay_power, 0.0) return float( self.config.control_weight_end + (self.config.control_weight_start - self.config.control_weight_end) * decay ) def calculate( self, *, info: Dict, current_edge_speeds: np.ndarray, prev_edge_speeds: np.ndarray, episode_index: int, ) -> float: downstream_mainline_outflow = float(info.get("downstream_mainline_outflow", 0.0)) r_outflow = clip01( downstream_mainline_outflow / max(self.config.mainline_discharge_ref_vehph, 1e-6) ) bottleneck_occupancy = float(info.get("bottleneck_occupancy", 0.0)) excess_occupancy = max( bottleneck_occupancy - self.config.bottleneck_critical_occupancy, 0.0, ) bottleneck_excess_norm = clip01( excess_occupancy / max(self.config.bottleneck_excess_occupancy_band, 1e-6) ) r_bottleneck = -bottleneck_excess_norm active_start = self.controlled_edge_start_index current_active_speeds = np.asarray(current_edge_speeds[active_start:], dtype=float) prev_active_speeds = np.asarray(prev_edge_speeds[active_start:], dtype=float) if current_active_speeds.size > 0: temporal_control_change = float( np.mean(np.abs(current_active_speeds - prev_active_speeds)) / max(self.config.delta_vsl_max, 1e-6) ) else: temporal_control_change = 0.0 if current_active_speeds.size >= 2: spatial_control_change = float( np.mean(np.abs(np.diff(current_active_speeds))) / max(self.config.delta_vsl_max, 1e-6) ) else: spatial_control_change = 0.0 control_change_norm = clip01( self.config.control_temporal_weight * clip01(temporal_control_change) + self.config.control_spatial_weight * clip01(spatial_control_change) ) r_ctrl = -control_change_norm control_weight = self.get_control_weight(episode_index) info["r_outflow"] = float(r_outflow) info["r_bottleneck"] = float(r_bottleneck) info["r_ctrl"] = float(r_ctrl) info["bottleneck_excess_occupancy_norm"] = float(bottleneck_excess_norm) info["temporal_control_change_norm"] = float(clip01(temporal_control_change)) info["spatial_control_change_norm"] = float(clip01(spatial_control_change)) info["control_weight"] = float(control_weight) reward = ( self.config.outflow_weight * r_outflow + self.config.bottleneck_weight * r_bottleneck + control_weight * r_ctrl ) return float(reward * self.config.reward_scale)