From 05ef01d93f0c7dadfe4d6a5c8ea3050078e0a7e3 Mon Sep 17 00:00:00 2001
From: Maple-YZ <zihanyee@gmail.com>
Date: Fri, 17 Apr 2026 07:01:18 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A5=96=E5=8A=B1=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=EF=BC=8C=E5=8F=96=E6=B6=88=E9=80=9A=E8=A1=8C=E9=87=8F=E6=8C=87?=
 =?UTF-8?q?=E6=A0=87=E6=94=B9=E4=B8=BA=E9=80=9A=E8=A1=8C=E6=95=88=E7=8E=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config_sumo_vsl.yaml            |  3 +--
 envs/reward_design_blueprint.py | 29 ++++++++++++++++-------------
 envs/reward_system.py           | 19 +++++++++----------
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/config_sumo_vsl.yaml b/config_sumo_vsl.yaml
index 5b5ade6..3ec8da4 100644
--- a/config_sumo_vsl.yaml
+++ b/config_sumo_vsl.yaml
@@ -53,9 +53,8 @@ environment:
 
   reward:
     reward_scale: 10.0
-    throughput_weight: 0.75
+    efficiency_weight: 0.75
     safety_weight: 0.20
-    throughput_ref_vehph: 3908.1
 
     bottleneck_window_size: 3
 
diff --git a/envs/reward_design_blueprint.py b/envs/reward_design_blueprint.py
index de8f874..02f8436 100644
--- a/envs/reward_design_blueprint.py
+++ b/envs/reward_design_blueprint.py
@@ -78,15 +78,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
 
     terms = (
         RewardTerm(
-            name="throughput",
-            symbol="R_throughput",
-            objective="Maximize realized corridor throughput in each control interval.",
+            name="efficiency",
+            symbol="R_efficiency",
+            objective="Maintain a high average running speed in the controlled corridor.",
             rationale=(
-                "If the experiment focuses on final operational effectiveness, a direct throughput term "
-                "is simpler and more result-oriented than several intermediate bottleneck-state surrogates."
+                "Instantaneous corridor outflow fluctuates strongly with simulation time and is noisy as a "
+                "step reward. A normalized mean-speed term is smoother, reacts more directly to VSL control, "
+                "and is easier for policy optimization to fit."
             ),
-            formula_tex=r"R_throughput(t)=clip(q_tp(t)/q_ref,0,1)",
-            required_signals=("interval throughput", "throughput reference"),
+            formula_tex=r"R_{\mathrm{efficiency}}(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}",
+            required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
         ),
         RewardTerm(
             name="safety",
@@ -105,17 +106,18 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
     )
 
     return RewardBlueprint(
-        name="Minimal Throughput-Safety Reward Blueprint For TCA-MAPPO",
+        name="Minimal Efficiency-Safety Reward Blueprint For TCA-MAPPO",
         scenario_summary=(
             "The study controls a segmented freeway VSL corridor under fixed control intervals. "
             "For architecture comparison, the reward should emphasize a small number of stable, "
             "interpretable traffic goals."
         ),
         primary_objective=(
-            "Increase realized throughput and reduce unsafe interactions with a compact reward."
+            "Improve corridor running efficiency and reduce unsafe interactions with a compact reward."
         ),
         design_principles=(
-            "Prefer direct result metrics over too many intermediate bottleneck heuristics.",
+            "Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
+            "Prefer direct operational efficiency metrics over too many intermediate bottleneck heuristics.",
             "Keep safety in the reward, but merge it into a single compact term.",
             "Avoid overlapping safety surrogates when one strong proxy is already available.",
             "Keep the reward simple enough that architecture comparison remains credible.",
@@ -123,16 +125,17 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
         ),
         terms=terms,
         global_formula_tex=(
-            r"R(t)=kappa [lambda_tp R_throughput(t)+lambda_safe R_safety(t)]"
+            r"R(t)=\kappa [\lambda_{\mathrm{eff}} R_{\mathrm{efficiency}}(t)+\lambda_{\mathrm{safe}} R_{\mathrm{safety}}(t)]"
         ),
         excluded_metrics=(
-            "Separate bottleneck tracking and oversaturation terms when throughput already captures outcome.",
+            "Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
             "Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
             "Control smoothness penalties that distort architecture comparison.",
         ),
         implementation_notes=(
+            "The efficiency term is currently built from normalized controlled-corridor mean speed.",
             "The safety term is currently built only from corridor stop rate.",
-            "Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
+            "Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
             "Training and evaluate now share the same fixed reward structure.",
         ),
     )
diff --git a/envs/reward_system.py b/envs/reward_system.py
index 7667d02..9b2a456 100644
--- a/envs/reward_system.py
+++ b/envs/reward_system.py
@@ -9,12 +9,12 @@ import numpy as np
 
 
 REWARD_COMPONENT_COLUMNS = (
-    "r_throughput",
+    "r_efficiency",
     "r_safety",
 )
 
 REWARD_COMPONENT_LABELS = {
-    "r_throughput": "R_throughput",
+    "r_efficiency": "R_efficiency",
     "r_safety": "R_safety",
 }
 
@@ -35,9 +35,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
 @dataclass(frozen=True)
 class RewardConfig:
     reward_scale: float = 10.0
-    throughput_weight: float = 0.75
+    efficiency_weight: float = 0.75
     safety_weight: float = 0.20
-    throughput_ref_vehph: float = 3908.1
     safety_stop_weight: float = 1.0
     bottleneck_window_size: int = 3
     v_limit: float = 33.33
@@ -54,9 +53,8 @@ class RewardConfig:
 
         return cls(
             reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
-            throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
+            efficiency_weight=float(raw_cfg.get("efficiency_weight", 0.75)),
             safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
-            throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
             safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
             bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
             v_limit=float(raw_cfg.get("v_limit", 33.33)),
@@ -88,19 +86,20 @@ class RewardCalculator:
     ) -> float:
         _ = current_edge_speeds, prev_edge_speeds, episode_index
 
-        throughput = float(info.get("throughput", 0.0))
-        r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
+        mean_speed = max(float(info.get("mean_speed", 0.0)), 0.0)
+        num_vehicles = max(int(info.get("num_vehicles", 0)), 0)
+        r_efficiency = clip01(mean_speed / max(self.config.v_limit, 1e-6)) if num_vehicles > 0 else 0.0
 
         stop_rate = clip01(float(info.get("stop_rate", 0.0)))
         safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
         r_safety = -safety_penalty
 
-        info["r_throughput"] = float(r_throughput)
+        info["r_efficiency"] = float(r_efficiency)
         info["r_safety"] = float(r_safety)
         info["safety_penalty_norm"] = float(safety_penalty)
 
         reward = (
-            self.config.throughput_weight * r_throughput
+            self.config.efficiency_weight * r_efficiency
             + self.config.safety_weight * r_safety
         )
         return float(reward * self.config.reward_scale)