# ctm-dqn/envs/reward_system.py

"""Shared reward configuration and calculation for freeway VSL environments."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Mapping, Sequence

import numpy as np


# Keys of the per-step reward components; RewardCalculator.calculate stores one
# value under each of these keys in the ``info`` dict it is given.
REWARD_COMPONENT_COLUMNS = ("r_outflow", "r_bottleneck", "r_ctrl")

# Display label associated with each reward component key.
REWARD_COMPONENT_LABELS = dict(
    r_outflow="R_outflow",
    r_bottleneck="R_bottleneck",
    r_ctrl="R_ctrl",
)


def clip01(value: float) -> float:
    """Return *value* limited to the closed interval [0, 1] as a Python float."""
    bounded = np.clip(value, 0.0, 1.0)
    return float(bounded)


def init_reward_component_totals() -> Dict[str, float]:
    """Return a fresh accumulator dict with every reward component set to 0.0."""
    return dict.fromkeys(REWARD_COMPONENT_COLUMNS, 0.0)


def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[str, float]:
    """Return per-step averages of the accumulated reward components.

    Args:
        totals: Accumulated component sums keyed by component name; a
            component missing from the mapping is treated as 0.0.
        steps: Number of steps the totals cover. Values below 1 are treated
            as 1 so the division is always defined.

    Returns:
        A new dict with one averaged entry per name in
        ``REWARD_COMPONENT_COLUMNS``.
    """
    divisor = max(int(steps), 1)
    averaged: Dict[str, float] = {}
    for name in REWARD_COMPONENT_COLUMNS:
        averaged[name] = float(totals.get(name, 0.0)) / divisor
    return averaged


@dataclass(frozen=True)
class RewardConfig:
    """Immutable set of weights, references, and thresholds for the reward.

    Instances are frozen; build one directly or through :meth:`from_dict`.
    """

    # Multiplier applied to the weighted sum of reward components.
    reward_scale: float = 10.0
    # Weights of the outflow and bottleneck components in the weighted sum.
    outflow_weight: float = 0.75
    bottleneck_weight: float = 0.20
    # Control-penalty weight schedule: value at the start of training, value at
    # the end, and the exponent shaping the decay between the two.
    control_weight_start: float = 0.05
    control_weight_end: float = 0.01
    control_weight_decay_power: float = 1.5
    # Reference the downstream mainline outflow is divided by to normalise it
    # (the name suggests veh/h -- confirm against the environment).
    mainline_discharge_ref_vehph: float = 4924.0
    # Occupancy above the critical value is penalised; the band is the amount
    # of excess occupancy that corresponds to the full penalty.
    bottleneck_critical_occupancy: float = 15.0
    bottleneck_excess_occupancy_band: float = 10.0
    # Not read by the reward computation visible in this module.
    bottleneck_window_size: int = 3
    # Mix between step-to-step and edge-to-edge speed changes in the control
    # penalty.
    control_temporal_weight: float = 0.7
    control_spatial_weight: float = 0.3
    # Speed difference used to normalise control changes.
    delta_vsl_max: float = 0.0
    # NOTE(review): the three fields below are not read by the reward code
    # visible in this module; their consumers should be confirmed.
    d_threshold: float = 3.0
    v_limit: float = 33.33
    leader_gap_threshold_m: float = 100.0

    @classmethod
    def from_dict(
        cls,
        raw_cfg: Mapping[str, object],
        *,
        speed_actions_ms: Sequence[float],
    ) -> "RewardConfig":
        """Build a config from a raw mapping, filling in missing entries.

        Args:
            raw_cfg: Mapping of option names to raw values. Each present value
                is converted with ``float`` (``int`` for
                ``bottleneck_window_size``); absent keys use the built-in
                fallback.
            speed_actions_ms: Available speed actions. When non-empty, their
                max-minus-min span is the fallback for ``delta_vsl_max``;
                when empty the fallback is 0.0.

        Returns:
            A new ``RewardConfig``. ``bottleneck_window_size`` is never less
            than 1.
        """
        if len(speed_actions_ms) > 0:
            highest = np.max(speed_actions_ms)
            lowest = np.min(speed_actions_ms)
            action_span = float(highest - lowest)
        else:
            action_span = 0.0

        def number(key: str, fallback: float) -> float:
            # Look up one option and coerce it to float.
            return float(raw_cfg.get(key, fallback))

        return cls(
            reward_scale=number("reward_scale", 10.0),
            outflow_weight=number("outflow_weight", 0.75),
            bottleneck_weight=number("bottleneck_weight", 0.20),
            control_weight_start=number("control_weight_start", 0.05),
            control_weight_end=number("control_weight_end", 0.01),
            control_weight_decay_power=number("control_weight_decay_power", 1.5),
            mainline_discharge_ref_vehph=number("mainline_discharge_ref_vehph", 4924.0),
            bottleneck_critical_occupancy=number("bottleneck_critical_occupancy", 15.0),
            bottleneck_excess_occupancy_band=number("bottleneck_excess_occupancy_band", 10.0),
            # A window smaller than one sample is raised to 1.
            bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
            control_temporal_weight=number("control_temporal_weight", 0.7),
            control_spatial_weight=number("control_spatial_weight", 0.3),
            delta_vsl_max=number("delta_vsl_max", action_span),
            d_threshold=number("d_threshold", 3.0),
            v_limit=number("v_limit", 33.33),
            leader_gap_threshold_m=number("leader_gap_threshold_m", 100.0),
        )


class RewardCalculator:
    """Scalar reward for mainline bottleneck VSL control.

    The reward combines a normalised outflow term, a bottleneck
    excess-occupancy penalty, and a control-change penalty whose weight
    follows a schedule over training episodes.
    """

    def __init__(
        self,
        *,
        config: RewardConfig,
        controlled_edge_start_index: int,
        total_training_episodes: int,
        evaluation_mode: bool = False,
    ):
        """Store the configuration and schedule parameters.

        Args:
            config: Reward weights, references, and thresholds.
            controlled_edge_start_index: Index of the first edge whose speed
                counts towards the control penalty; earlier edges are ignored.
            total_training_episodes: Length of the control-weight schedule;
                values below 1 are raised to 1.
            evaluation_mode: When true, the control weight is fixed at
                ``config.control_weight_end``.
        """
        self.config = config
        self.controlled_edge_start_index = int(controlled_edge_start_index)
        self.total_training_episodes = max(int(total_training_episodes), 1)
        self.evaluation_mode = bool(evaluation_mode)

    def get_control_weight(self, episode_index: int) -> float:
        """Return the control-penalty weight for the given episode.

        In evaluation mode the end weight is returned directly. Otherwise the
        weight moves from ``control_weight_start`` to ``control_weight_end``
        as ``episode_index`` goes from 1 to ``total_training_episodes``,
        following ``(1 - progress) ** control_weight_decay_power``.
        """
        cfg = self.config
        final_weight = cfg.control_weight_end
        if self.evaluation_mode:
            return float(final_weight)

        last_episode = self.total_training_episodes
        if last_episode <= 1:
            # A single-episode schedule is treated as already finished.
            progress = 1.0
        else:
            progress = clip01((float(episode_index) - 1.0) / (last_episode - 1.0))

        # A negative exponent is treated as zero.
        exponent = max(cfg.control_weight_decay_power, 0.0)
        decay = (1.0 - progress) ** exponent
        return float(final_weight + (cfg.control_weight_start - final_weight) * decay)

    def calculate(
        self,
        *,
        info: Dict,
        current_edge_speeds: np.ndarray,
        prev_edge_speeds: np.ndarray,
        episode_index: int,
    ) -> float:
        """Compute the scaled reward for one step and record its components.

        Args:
            info: Step info dict. ``"downstream_mainline_outflow"`` and
                ``"bottleneck_occupancy"`` are read from it (each defaulting
                to 0.0); the reward components and diagnostics are written
                back into it.
            current_edge_speeds: Edge speeds for the current step.
            prev_edge_speeds: Edge speeds for the previous step.
            episode_index: Episode number used for the control-weight
                schedule.

        Returns:
            The weighted component sum multiplied by ``config.reward_scale``.
        """
        cfg = self.config

        # Outflow term: measured outflow relative to the reference, in [0, 1].
        outflow = float(info.get("downstream_mainline_outflow", 0.0))
        outflow_ref = max(cfg.mainline_discharge_ref_vehph, 1e-6)
        r_outflow = clip01(outflow / outflow_ref)

        # Bottleneck term: occupancy above the critical value, relative to the
        # configured band, as a penalty in [-1, 0].
        occupancy = float(info.get("bottleneck_occupancy", 0.0))
        occupancy_over = max(occupancy - cfg.bottleneck_critical_occupancy, 0.0)
        occupancy_band = max(cfg.bottleneck_excess_occupancy_band, 1e-6)
        excess_norm = clip01(occupancy_over / occupancy_band)
        r_bottleneck = -excess_norm

        # Control term: only edges from the controlled start index onward.
        first = self.controlled_edge_start_index
        speeds_now = np.asarray(current_edge_speeds[first:], dtype=float)
        speeds_before = np.asarray(prev_edge_speeds[first:], dtype=float)
        # Guard against a zero or negative normaliser.
        speed_span = max(cfg.delta_vsl_max, 1e-6)

        temporal_change = 0.0
        spatial_change = 0.0
        edge_count = speeds_now.size
        if edge_count > 0:
            # Mean absolute change of each edge's speed since the last step.
            temporal_change = float(
                np.mean(np.abs(speeds_now - speeds_before)) / speed_span
            )
        if edge_count >= 2:
            # Mean absolute difference between neighbouring edges.
            spatial_change = float(
                np.mean(np.abs(np.diff(speeds_now))) / speed_span
            )

        temporal_norm = clip01(temporal_change)
        spatial_norm = clip01(spatial_change)
        change_norm = clip01(
            cfg.control_temporal_weight * temporal_norm
            + cfg.control_spatial_weight * spatial_norm
        )
        r_ctrl = -change_norm

        control_weight = self.get_control_weight(episode_index)

        # Written only after every value has been computed.
        diagnostics = (
            ("r_outflow", r_outflow),
            ("r_bottleneck", r_bottleneck),
            ("r_ctrl", r_ctrl),
            ("bottleneck_excess_occupancy_norm", excess_norm),
            ("temporal_control_change_norm", temporal_norm),
            ("spatial_control_change_norm", spatial_norm),
            ("control_weight", control_weight),
        )
        for key, value in diagnostics:
            info[key] = float(value)

        weighted_sum = (
            cfg.outflow_weight * r_outflow
            + cfg.bottleneck_weight * r_bottleneck
            + control_weight * r_ctrl
        )
        return float(weighted_sum * cfg.reward_scale)