新奖励函数，取消通行量指标改为通行效率

2026-04-17 07:01:18 +08:00 · 2026-04-17 07:01:18 +08:00 · 05ef01d93f
parent 3d1782c348
commit 05ef01d93f
3 changed files with 26 additions and 25 deletions
--- a/config_sumo_vsl.yaml
+++ b/config_sumo_vsl.yaml
@@ -53,9 +53,8 @@ environment:

  reward:
    reward_scale: 10.0
-    throughput_weight: 0.75
+    efficiency_weight: 0.75
    safety_weight: 0.20
-    throughput_ref_vehph: 3908.1

    bottleneck_window_size: 3

--- a/envs/reward_design_blueprint.py
+++ b/envs/reward_design_blueprint.py
@@ -78,15 +78,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:

    terms = (
        RewardTerm(
-            name="throughput",
-            symbol="R_throughput",
-            objective="Maximize realized corridor throughput in each control interval.",
+            name="efficiency",
+            symbol="R_efficiency",
+            objective="Maintain a high average running speed in the controlled corridor.",
            rationale=(
-                "If the experiment focuses on final operational effectiveness, a direct throughput term "
-                "is simpler and more result-oriented than several intermediate bottleneck-state surrogates."
+                "Instantaneous corridor outflow fluctuates strongly with simulation time and is noisy as a "
+                "step reward. A normalized mean-speed term is smoother, reacts more directly to VSL control, "
+                "and is easier for policy optimization to fit."
            ),
-            formula_tex=r"R_throughput(t)=clip(q_tp(t)/q_ref,0,1)",
-            required_signals=("interval throughput", "throughput reference"),
+            formula_tex=r"R_efficiency(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}",
+            required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
        ),
        RewardTerm(
            name="safety",
@@ -105,17 +106,18 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
    )

    return RewardBlueprint(
-        name="Minimal Throughput-Safety Reward Blueprint For TCA-MAPPO",
+        name="Minimal Efficiency-Safety Reward Blueprint For TCA-MAPPO",
        scenario_summary=(
            "The study controls a segmented freeway VSL corridor under fixed control intervals. "
            "For architecture comparison, the reward should emphasize a small number of stable, "
            "interpretable traffic goals."
        ),
        primary_objective=(
-            "Increase realized throughput and reduce unsafe interactions with a compact reward."
+            "Improve corridor running efficiency and reduce unsafe interactions with a compact reward."
        ),
        design_principles=(
-            "Prefer direct result metrics over too many intermediate bottleneck heuristics.",
+            "Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
+            "Prefer direct operational efficiency metrics over too many intermediate bottleneck heuristics.",
            "Keep safety in the reward, but merge it into a single compact term.",
            "Avoid overlapping safety surrogates when one strong proxy is already available.",
            "Keep the reward simple enough that architecture comparison remains credible.",
@@ -123,16 +125,17 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
        ),
        terms=terms,
        global_formula_tex=(
-            r"R(t)=kappa [lambda_tp R_throughput(t)+lambda_safe R_safety(t)]"
+            r"R(t)=\kappa [\lambda_{\mathrm{eff}} R_{\mathrm{efficiency}}(t)+\lambda_{\mathrm{safe}} R_{\mathrm{safety}}(t)]"
        ),
        excluded_metrics=(
-            "Separate bottleneck tracking and oversaturation terms when throughput already captures outcome.",
+            "Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
            "Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
            "Control smoothness penalties that distort architecture comparison.",
        ),
        implementation_notes=(
+            "The efficiency term is currently built from normalized controlled-corridor mean speed.",
            "The safety term is currently built only from corridor stop rate.",
-            "Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
+            "Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
            "Training and evaluate now share the same fixed reward structure.",
        ),
    )
--- a/envs/reward_system.py
+++ b/envs/reward_system.py
@@ -9,12 +9,12 @@ import numpy as np


 REWARD_COMPONENT_COLUMNS = (
-    "r_throughput",
+    "r_efficiency",
    "r_safety",
 )

 REWARD_COMPONENT_LABELS = {
-    "r_throughput": "R_throughput",
+    "r_efficiency": "R_efficiency",
    "r_safety": "R_safety",
 }

@@ -35,9 +35,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
 @dataclass(frozen=True)
 class RewardConfig:
    reward_scale: float = 10.0
-    throughput_weight: float = 0.75
+    efficiency_weight: float = 0.75
    safety_weight: float = 0.20
-    throughput_ref_vehph: float = 3908.1
    safety_stop_weight: float = 1.0
    bottleneck_window_size: int = 3
    v_limit: float = 33.33
@@ -54,9 +53,8 @@ class RewardConfig:

        return cls(
            reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
-            throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
+            efficiency_weight=float(raw_cfg.get("efficiency_weight", 0.75)),
            safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
-            throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
            safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
            bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
            v_limit=float(raw_cfg.get("v_limit", 33.33)),
@@ -88,19 +86,20 @@ class RewardCalculator:
    ) -> float:
        _ = current_edge_speeds, prev_edge_speeds, episode_index

-        throughput = float(info.get("throughput", 0.0))
-        r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
+        mean_speed = max(float(info.get("mean_speed", 0.0)), 0.0)
+        num_vehicles = max(int(info.get("num_vehicles", 0)), 0)
+        r_efficiency = clip01(mean_speed / max(self.config.v_limit, 1e-6)) if num_vehicles > 0 else 0.0

        stop_rate = clip01(float(info.get("stop_rate", 0.0)))
        safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
        r_safety = -safety_penalty

-        info["r_throughput"] = float(r_throughput)
+        info["r_efficiency"] = float(r_efficiency)
        info["r_safety"] = float(r_safety)
        info["safety_penalty_norm"] = float(safety_penalty)

        reward = (
-            self.config.throughput_weight * r_throughput
+            self.config.efficiency_weight * r_efficiency
            + self.config.safety_weight * r_safety
        )
        return float(reward * self.config.reward_scale)