From 2f594b0eb0a39370eec95f912b1f0677f0768ac2 Mon Sep 17 00:00:00 2001
From: Maple-YZ <zihanyee@gmail.com>
Date: Fri, 17 Apr 2026 06:09:51 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=A5=96=E5=8A=B1=E5=87=BD?=
 =?UTF-8?q?=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config_sumo_vsl.yaml            |  3 +--
 envs/reward_design_blueprint.py | 16 ++++++++--------
 envs/reward_system.py           | 12 +++---------
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/config_sumo_vsl.yaml b/config_sumo_vsl.yaml
index 2d6829d..5b5ade6 100644
--- a/config_sumo_vsl.yaml
+++ b/config_sumo_vsl.yaml
@@ -59,8 +59,7 @@ environment:
 
     bottleneck_window_size: 3
 
-    safety_closing_speed_weight: 0.6
-    safety_stop_weight: 0.4
+    safety_stop_weight: 1.0
 
     v_limit: 30.56
     leader_gap_threshold_m: 100.0
diff --git a/envs/reward_design_blueprint.py b/envs/reward_design_blueprint.py
index 6b75851..de8f874 100644
--- a/envs/reward_design_blueprint.py
+++ b/envs/reward_design_blueprint.py
@@ -91,16 +91,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
         RewardTerm(
             name="safety",
             symbol="R_safety",
-            objective="Penalize unsafe speed interactions and excessive stopping with one merged safety term.",
+            objective="Penalize corridor stopping with one compact safety term.",
             rationale=(
-                "Safety should remain in the reward, but in a compact form. Positive closing-speed "
-                "dispersion and corridor stop rate are already available in the environment and together "
-                "provide a reasonable lightweight safety proxy."
+                "For this experiment, excessive stopping is already a strong proxy for unstable or "
+                "unsafe traffic operation. Using only stop rate keeps the objective simpler and "
+                "reduces interference from overlapping surrogate signals."
             ),
             formula_tex=(
-                r"R_safety(t)=-clip(w_c \hat{\sigma}_{close}^2(t)+w_s \hat{s}(t),0,1)"
+                r"R_safety(t)=-clip(w_s \hat{s}(t),0,1)"
             ),
-            required_signals=("closing-speed variance", "corridor stop rate", "controlled vehicle count"),
+            required_signals=("corridor stop rate",),
         ),
     )
 
@@ -117,7 +117,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
         design_principles=(
             "Prefer direct result metrics over too many intermediate bottleneck heuristics.",
             "Keep safety in the reward, but merge it into a single compact term.",
-            "Use the same safety proxies that are already measurable in the environment.",
+            "Avoid overlapping safety surrogates when one strong proxy is already available.",
             "Keep the reward simple enough that architecture comparison remains credible.",
             "Avoid auxiliary regularizers that change the objective across training stages.",
         ),
@@ -131,7 +131,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
             "Control smoothness penalties that distort architecture comparison.",
         ),
         implementation_notes=(
-            "The safety term is currently built from normalized positive closing-speed variance and corridor stop rate.",
+            "The safety term is currently built only from corridor stop rate.",
             "Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
             "Training and evaluate now share the same fixed reward structure.",
         ),
diff --git a/envs/reward_system.py b/envs/reward_system.py
index 6c766d2..7667d02 100644
--- a/envs/reward_system.py
+++ b/envs/reward_system.py
@@ -38,8 +38,7 @@ class RewardConfig:
     throughput_weight: float = 0.75
     safety_weight: float = 0.20
     throughput_ref_vehph: float = 3908.1
-    safety_closing_speed_weight: float = 0.6
-    safety_stop_weight: float = 0.4
+    safety_stop_weight: float = 1.0
     bottleneck_window_size: int = 3
     v_limit: float = 33.33
     leader_gap_threshold_m: float = 100.0
@@ -58,8 +57,7 @@ class RewardConfig:
             throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
             safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
             throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
-            safety_closing_speed_weight=float(raw_cfg.get("safety_closing_speed_weight", 0.6)),
-            safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 0.4)),
+            safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
             bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
             v_limit=float(raw_cfg.get("v_limit", 33.33)),
             leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
@@ -93,12 +91,8 @@ class RewardCalculator:
         throughput = float(info.get("throughput", 0.0))
         r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
 
-        speed_variance_norm = clip01(float(info.get("speed_variance_norm", 0.0)))
         stop_rate = clip01(float(info.get("stop_rate", 0.0)))
-        safety_penalty = clip01(
-            self.config.safety_closing_speed_weight * speed_variance_norm
-            + self.config.safety_stop_weight * stop_rate
-        )
+        safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
         r_safety = -safety_penalty
 
         info["r_throughput"] = float(r_throughput)