# ctm-dqn/envs/reward_design_blueprint.py

"""Minimal reward blueprint for the current freeway VSL study."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable, Tuple


@dataclass(frozen=True)
class RewardTerm:
    """One named utility component of a reward blueprint.

    All fields are descriptive text; ``to_markdown`` renders them as a
    level-three Markdown section with one bullet per field.
    """

    name: str
    symbol: str
    objective: str
    rationale: str
    formula_tex: str
    required_signals: Tuple[str, ...]

    def to_markdown(self) -> str:
        """Return this term as a Markdown section (no trailing newline)."""
        # Each signal is wrapped in backticks so it renders as inline code.
        quoted_signals = [f"`{signal}`" for signal in self.required_signals]
        heading = f"### {self.name}"
        bullets = (
            f"- Symbol: `{self.symbol}`",
            f"- Objective: {self.objective}",
            f"- Rationale: {self.rationale}",
            f"- Formula: `{self.formula_tex}`",
            "- Required signals: " + ", ".join(quoted_signals),
        )
        return "\n".join((heading, *bullets))


@dataclass(frozen=True)
class RewardBlueprint:
    """Descriptive summary of a reward design that can be rendered as Markdown.

    ``excluded_metrics`` and ``implementation_notes`` default to empty tuples;
    their sections are omitted from the rendered document when empty.
    """

    name: str
    scenario_summary: str
    primary_objective: str
    design_principles: Tuple[str, ...]
    terms: Tuple[RewardTerm, ...]
    global_formula_tex: str
    excluded_metrics: Tuple[str, ...] = ()
    implementation_notes: Tuple[str, ...] = ()

    def term_names(self) -> Tuple[str, ...]:
        """Return the ``name`` of every term, in the order the terms are stored."""
        return tuple(entry.name for entry in self.terms)

    def to_markdown(self) -> str:
        """Render the full blueprint as a Markdown document ending in one newline."""

        def bullets(items):
            # One "- item" line per entry.
            return [f"- {entry}" for entry in items]

        parts = [
            f"# {self.name}",
            "",
            "## Scenario",
            self.scenario_summary,
            "",
            "## Primary Objective",
            self.primary_objective,
            "",
            "## Global Formula",
            f"`{self.global_formula_tex}`",
            "",
            "## Design Principles",
            *bullets(self.design_principles),
            "",
            "## Reward Terms",
        ]

        # Every term section is followed by a blank separator line.
        for entry in self.terms:
            parts += [entry.to_markdown(), ""]

        if self.excluded_metrics:
            parts += [
                "## Metrics To Avoid As Primary Reward Drivers",
                *bullets(self.excluded_metrics),
                "",
            ]

        if self.implementation_notes:
            parts += ["## Implementation Notes", *bullets(self.implementation_notes)]

        # Trailing blank lines from whichever section came last are trimmed,
        # then exactly one newline is appended.
        document = "\n".join(parts)
        return document.rstrip() + "\n"


def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
    """Assemble the two-term multiplicative-MAUT reward blueprint for corridor VSL.

    The result is purely descriptive: it holds the efficiency and safety
    utility terms, the product-form global formula, and the accompanying
    design principles, excluded metrics, and implementation notes.
    """

    efficiency_term = RewardTerm(
        name="efficiency",
        symbol="R_efficiency",
        objective="Map normalized corridor running efficiency into a bounded utility term.",
        rationale=(
            "The raw efficiency indicator should not enter the reward linearly. A monotone concave utility "
            "function emphasizes recovery from low-efficiency states while avoiding over-rewarding already "
            "high-speed regimes."
        ),
        formula_tex=(
            r"R_{\mathrm{efficiency}}(t)=1-\exp\!\left(-\alpha E(t)\right),\ "
            r"E(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}"
        ),
        required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
    )

    safety_term = RewardTerm(
        name="safety",
        symbol="R_safety",
        objective="Map TTC-based following risk into a bounded safety utility term.",
        rationale=(
            "For freeway VSL, rear-end conflict risk is better captured by time-to-collision than by "
            "stop counting. An exponential safety utility makes the reward sensitive to the spread of "
            "short-TTC following states while preserving boundedness and interpretability."
        ),
        formula_tex=(
            r"R_{\mathrm{safety}}(t)=\exp\!\left(-\beta S(t)\right),\ "
            r"S(t)=\frac{1}{N(t)}\sum_{i=1}^{N(t)}\max\!\left(0,1-\frac{\mathrm{TTC}_i(t)}{\tau_{\mathrm{ttc}}}\right)"
        ),
        required_signals=(
            "controlled-corridor vehicle speeds",
            "leader speeds",
            "leader gaps",
            "TTC threshold",
        ),
    )

    scenario = (
        "The study controls a segmented freeway VSL corridor under fixed control intervals. "
        "For architecture comparison, the reward should emphasize a small number of stable, "
        "interpretable traffic goals while avoiding overly simplistic linear compensation between "
        "efficiency and safety."
    )
    objective = "Improve corridor running efficiency and traffic safety through a nonlinear utility-based reward."

    principles = (
        "Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
        "Map raw traffic indicators into utility space before aggregation.",
        "Avoid full linear substitutability between efficiency and safety.",
        "Use a compact nonlinear aggregation that remains easy to interpret and implement.",
        "Use a freeway-native rear-end risk surrogate instead of coarse stop counting.",
        "Avoid auxiliary regularizers that change the objective across training stages.",
    )

    # Weighted product of the two utilities.
    aggregate_formula = (
        r"R(t)=R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}"
        r"R_{\mathrm{safety}}(t)^{\lambda_{\mathrm{safe}}}"
    )

    avoided = (
        "Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
        "Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
        "Purely linear reward aggregation when safety should not be fully compensated by efficiency gains.",
    )

    notes = (
        "The efficiency utility is built from normalized controlled-corridor mean speed.",
        "The safety utility is built from a TTC-thresholded average following-risk indicator.",
        "The exponents are normalized internally before multiplicative aggregation.",
        "Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
        "Training and evaluate now share the same fixed reward structure.",
    )

    return RewardBlueprint(
        name="Multiplicative MAUT Reward Blueprint For TCA-MAPPO",
        scenario_summary=scenario,
        primary_objective=objective,
        design_principles=principles,
        terms=(efficiency_term, safety_term),
        global_formula_tex=aggregate_formula,
        excluded_metrics=avoided,
        implementation_notes=notes,
    )


def build_reward_blueprint_markdown() -> str:
    """Return the TCA-MAPPO reward blueprint rendered as a Markdown document."""
    blueprint = build_tca_mappo_reward_blueprint()
    return blueprint.to_markdown()


def iter_required_signals() -> Iterable[str]:
    """Return every signal required by the blueprint's terms, deduplicated and sorted.

    Despite the ``iter`` prefix, the result is a fully built tuple.
    """
    blueprint = build_tca_mappo_reward_blueprint()
    unique_signals = {
        signal
        for term in blueprint.terms
        for signal in term.required_signals
    }
    return tuple(sorted(unique_signals))


# Script entry point: when run directly, print the rendered blueprint to stdout.
if __name__ == "__main__":
    print(build_reward_blueprint_markdown())