# Source path: ctm-dqn/envs/reward_system.py
# Viewer metadata at capture time: 184 lines, 6.8 KiB, Python.

"""Shared reward configuration and calculation for freeway VSL environments."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Mapping, Sequence
import numpy as np
# Label for each reward component, keyed by the name under which
# RewardCalculator.calculate stores that component in the step ``info`` dict.
# Insertion order matters: it fixes the order of REWARD_COMPONENT_COLUMNS.
REWARD_COMPONENT_LABELS = {
    "r_outflow": "R_outflow",
    "r_bottleneck": "R_bottleneck",
    "r_ctrl": "R_ctrl",
}

# Component keys as an ordered tuple, derived from the label mapping so the
# names are spelled out in exactly one place.
REWARD_COMPONENT_COLUMNS = tuple(REWARD_COMPONENT_LABELS)
def clip01(value: float) -> float:
    """Return ``value`` limited to the closed interval [0, 1] as a built-in float."""
    bounded = np.clip(value, 0.0, 1.0)
    return float(bounded)
def init_reward_component_totals() -> Dict[str, float]:
    """Return a fresh accumulator with one ``0.0`` entry per reward component.

    Keys follow the order of ``REWARD_COMPONENT_COLUMNS``. A new dict is built
    on every call, so callers can mutate the result freely.
    """
    return dict.fromkeys(REWARD_COMPONENT_COLUMNS, 0.0)
def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[str, float]:
    """Convert accumulated component totals into per-step averages.

    Args:
        totals: Accumulated value per component; components missing from the
            mapping are treated as ``0.0``.
        steps: Number of steps the totals cover. Values below 1 are treated
            as 1 so the division is always defined.

    Returns:
        A dict with one averaged entry per name in ``REWARD_COMPONENT_COLUMNS``;
        any other keys present in ``totals`` are ignored.
    """
    step_count = int(steps)
    if step_count < 1:
        step_count = 1

    averages: Dict[str, float] = {}
    for name in REWARD_COMPONENT_COLUMNS:
        averages[name] = float(totals.get(name, 0.0)) / step_count
    return averages
@dataclass(frozen=True)
class RewardConfig:
    """Immutable bundle of tunables for the reward computed in this module.

    Every field has a default, so ``RewardConfig()`` is a valid baseline;
    :meth:`from_dict` overlays values from a raw mapping on top of it.
    """

    # Factor applied to the weighted component sum to produce the final reward.
    reward_scale: float = 10.0
    # Weight of the outflow component (r_outflow).
    outflow_weight: float = 0.75
    # Weight of the bottleneck-occupancy penalty (r_bottleneck).
    bottleneck_weight: float = 0.20
    # Weight of the control-change penalty (r_ctrl) at the start and at the end
    # of training; RewardCalculator.get_control_weight interpolates between them.
    control_weight_start: float = 0.05
    control_weight_end: float = 0.01
    # Exponent applied to (1 - training progress) in that interpolation.
    control_weight_decay_power: float = 1.5
    # Outflow at which r_outflow saturates at 1 (veh/h, judging by the name).
    mainline_discharge_ref_vehph: float = 4924.0
    # Occupancy above which the bottleneck penalty starts to apply.
    bottleneck_critical_occupancy: float = 15.0
    # Excess occupancy (above the critical value) at which the penalty reaches -1.
    bottleneck_excess_occupancy_band: float = 10.0
    # NOTE(review): not read anywhere in this module; presumably used by the
    # environments that import this config.
    bottleneck_window_size: int = 3
    # Mixing weights for the temporal and spatial parts of the control penalty.
    control_temporal_weight: float = 0.7
    control_spatial_weight: float = 0.3
    # Speed difference used to normalise control changes. from_dict derives it
    # from the action set when the raw mapping does not provide one.
    delta_vsl_max: float = 0.0
    # NOTE(review): the three fields below are not read in this module either;
    # their meaning must be confirmed against their consumers.
    d_threshold: float = 3.0
    v_limit: float = 33.33
    leader_gap_threshold_m: float = 100.0

    @classmethod
    def from_dict(
        cls,
        raw_cfg: Mapping[str, object],
        *,
        speed_actions_ms: Sequence[float],
    ) -> "RewardConfig":
        """Build a config from a raw mapping, filling gaps with field defaults.

        Fallback values are taken from a default-constructed instance rather
        than repeated here, so the dataclass fields are the single source of
        truth (and a subclass that overrides a default is honoured).

        Args:
            raw_cfg: Mapping of field name to raw value. Keys that are missing,
                or present with an explicit ``None`` (e.g. an empty YAML
                entry), fall back to the default. Other values are coerced
                with ``float``/``int``.
            speed_actions_ms: Available speed actions. When ``raw_cfg`` has no
                ``delta_vsl_max``, the spread ``max - min`` of this sequence
                is used; for an empty sequence the field default is kept.

        Returns:
            A new instance of ``cls``.

        Raises:
            TypeError, ValueError: If a supplied value cannot be converted to
                the numeric type of its field.
        """
        baseline = cls()

        def setting(key: str, fallback: object) -> object:
            # An explicit None means "not set", same as a missing key.
            value = raw_cfg.get(key)
            return fallback if value is None else value

        def as_float(key: str) -> float:
            return float(setting(key, getattr(baseline, key)))

        # len() rather than truthiness: the actions may be a NumPy array.
        if len(speed_actions_ms) > 0:
            speed_span = float(np.max(speed_actions_ms) - np.min(speed_actions_ms))
        else:
            speed_span = baseline.delta_vsl_max

        window = int(setting("bottleneck_window_size", baseline.bottleneck_window_size))

        return cls(
            reward_scale=as_float("reward_scale"),
            outflow_weight=as_float("outflow_weight"),
            bottleneck_weight=as_float("bottleneck_weight"),
            control_weight_start=as_float("control_weight_start"),
            control_weight_end=as_float("control_weight_end"),
            control_weight_decay_power=as_float("control_weight_decay_power"),
            mainline_discharge_ref_vehph=as_float("mainline_discharge_ref_vehph"),
            bottleneck_critical_occupancy=as_float("bottleneck_critical_occupancy"),
            bottleneck_excess_occupancy_band=as_float("bottleneck_excess_occupancy_band"),
            # A window must cover at least one sample.
            bottleneck_window_size=max(1, window),
            control_temporal_weight=as_float("control_temporal_weight"),
            control_spatial_weight=as_float("control_spatial_weight"),
            delta_vsl_max=float(setting("delta_vsl_max", speed_span)),
            d_threshold=as_float("d_threshold"),
            v_limit=as_float("v_limit"),
            leader_gap_threshold_m=as_float("leader_gap_threshold_m"),
        )
class RewardCalculator:
    """Computes the scalar step reward from outflow, bottleneck and control terms.

    The reward is a weighted sum of three components, multiplied by
    ``config.reward_scale``:

    * ``r_outflow`` in [0, 1]: downstream outflow relative to a reference.
    * ``r_bottleneck`` in [-1, 0]: penalty for occupancy above a critical value.
    * ``r_ctrl`` in [-1, 0]: penalty for speed changes over time and between
      neighbouring controlled edges.
    """

    def __init__(
        self,
        *,
        config: RewardConfig,
        controlled_edge_start_index: int,
        total_training_episodes: int,
        evaluation_mode: bool = False,
    ):
        """Store the configuration and the schedule parameters.

        Args:
            config: Reward tunables.
            controlled_edge_start_index: Index of the first entry of the speed
                arrays that takes part in the control penalty.
            total_training_episodes: Length of the control-weight schedule;
                values below 1 are raised to 1.
            evaluation_mode: When true, the control weight is pinned to
                ``config.control_weight_end``.
        """
        self.config = config
        self.evaluation_mode = bool(evaluation_mode)
        self.total_training_episodes = max(int(total_training_episodes), 1)
        self.controlled_edge_start_index = int(controlled_edge_start_index)

    def get_control_weight(self, episode_index: int) -> float:
        """Return the control-penalty weight for the given episode.

        The weight moves from ``control_weight_start`` to ``control_weight_end``
        following ``(1 - progress) ** decay_power``. Progress is 0 at episode 1
        and 1 at the last training episode. In evaluation mode the end weight
        is returned directly.
        """
        cfg = self.config
        if self.evaluation_mode:
            return float(cfg.control_weight_end)

        episodes = self.total_training_episodes
        if episodes > 1:
            progress = clip01((float(episode_index) - 1.0) / (episodes - 1.0))
        else:
            # A one-episode schedule is treated as already finished.
            progress = 1.0

        # A negative exponent would make the weight grow, so it is floored at 0.
        exponent = max(cfg.control_weight_decay_power, 0.0)
        remaining = (1.0 - progress) ** exponent
        blended = (
            cfg.control_weight_end
            + (cfg.control_weight_start - cfg.control_weight_end) * remaining
        )
        return float(blended)

    def calculate(
        self,
        *,
        info: Dict,
        current_edge_speeds: np.ndarray,
        prev_edge_speeds: np.ndarray,
        episode_index: int,
    ) -> float:
        """Compute the scaled reward for one step and record its parts in ``info``.

        Args:
            info: Step info dict. ``"downstream_mainline_outflow"`` and
                ``"bottleneck_occupancy"`` are read from it (default 0.0).
                It is updated in place with ``r_outflow``, ``r_bottleneck``,
                ``r_ctrl``, ``bottleneck_excess_occupancy_norm``,
                ``temporal_control_change_norm``,
                ``spatial_control_change_norm`` and ``control_weight``.
            current_edge_speeds: Speeds for this step; entries from
                ``controlled_edge_start_index`` onward are used.
            prev_edge_speeds: Speeds for the previous step, sliced the same way.
            episode_index: Episode number passed to ``get_control_weight``.

        Returns:
            The weighted component sum multiplied by ``config.reward_scale``.
        """
        cfg = self.config

        # Outflow term: fraction of the reference discharge, capped at 1.
        outflow = float(info.get("downstream_mainline_outflow", 0.0))
        r_outflow = clip01(outflow / max(cfg.mainline_discharge_ref_vehph, 1e-6))

        # Bottleneck term: only occupancy above the critical value is penalised.
        occupancy = float(info.get("bottleneck_occupancy", 0.0))
        overshoot = max(occupancy - cfg.bottleneck_critical_occupancy, 0.0)
        excess_norm = clip01(overshoot / max(cfg.bottleneck_excess_occupancy_band, 1e-6))
        r_bottleneck = -excess_norm

        # Control term: restricted to the controlled tail of the speed arrays.
        first = self.controlled_edge_start_index
        speeds_now = np.asarray(current_edge_speeds[first:], dtype=float)
        speeds_before = np.asarray(prev_edge_speeds[first:], dtype=float)

        temporal_raw = (
            float(
                np.mean(np.abs(speeds_now - speeds_before))
                / max(cfg.delta_vsl_max, 1e-6)
            )
            if speeds_now.size > 0
            else 0.0
        )
        # Differences between neighbours need at least two entries.
        spatial_raw = (
            float(
                np.mean(np.abs(np.diff(speeds_now)))
                / max(cfg.delta_vsl_max, 1e-6)
            )
            if speeds_now.size >= 2
            else 0.0
        )

        temporal_norm = clip01(temporal_raw)
        spatial_norm = clip01(spatial_raw)
        change_norm = clip01(
            cfg.control_temporal_weight * temporal_norm
            + cfg.control_spatial_weight * spatial_norm
        )
        r_ctrl = -change_norm

        control_weight = self.get_control_weight(episode_index)

        # Written one key at a time, in this order, so new keys land in ``info``
        # in a stable sequence.
        diagnostics = (
            ("r_outflow", r_outflow),
            ("r_bottleneck", r_bottleneck),
            ("r_ctrl", r_ctrl),
            ("bottleneck_excess_occupancy_norm", excess_norm),
            ("temporal_control_change_norm", temporal_norm),
            ("spatial_control_change_norm", spatial_norm),
            ("control_weight", control_weight),
        )
        for key, value in diagnostics:
            info[key] = float(value)

        weighted_sum = (
            cfg.outflow_weight * r_outflow
            + cfg.bottleneck_weight * r_bottleneck
            + control_weight * r_ctrl
        )
        return float(weighted_sum * cfg.reward_scale)