更新奖励函数

This commit is contained in:
Zihan Ye 2026-04-17 06:09:51 +08:00
parent c126e4a9f5
commit 2f594b0eb0
3 changed files with 12 additions and 19 deletions

View File

@@ -59,8 +59,7 @@ environment:
bottleneck_window_size: 3 bottleneck_window_size: 3
safety_closing_speed_weight: 0.6 safety_stop_weight: 1.0
safety_stop_weight: 0.4
v_limit: 30.56 v_limit: 30.56
leader_gap_threshold_m: 100.0 leader_gap_threshold_m: 100.0

View File

@@ -91,16 +91,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
RewardTerm( RewardTerm(
name="safety", name="safety",
symbol="R_safety", symbol="R_safety",
objective="Penalize unsafe speed interactions and excessive stopping with one merged safety term.", objective="Penalize corridor stopping with one compact safety term.",
rationale=( rationale=(
"Safety should remain in the reward, but in a compact form. Positive closing-speed " "For this experiment, excessive stopping is already a strong proxy for unstable or "
"dispersion and corridor stop rate are already available in the environment and together " "unsafe traffic operation. Using only stop rate keeps the objective simpler and "
"provide a reasonable lightweight safety proxy." "reduces interference from overlapping surrogate signals."
), ),
formula_tex=( formula_tex=(
r"R_safety(t)=-clip(w_c \hat{\sigma}_{close}^2(t)+w_s \hat{s}(t),0,1)" r"R_safety(t)=-clip(w_s \hat{s}(t),0,1)"
), ),
required_signals=("closing-speed variance", "corridor stop rate", "controlled vehicle count"), required_signals=("corridor stop rate",),
), ),
) )
@@ -117,7 +117,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
design_principles=( design_principles=(
"Prefer direct result metrics over too many intermediate bottleneck heuristics.", "Prefer direct result metrics over too many intermediate bottleneck heuristics.",
"Keep safety in the reward, but merge it into a single compact term.", "Keep safety in the reward, but merge it into a single compact term.",
"Use the same safety proxies that are already measurable in the environment.", "Avoid overlapping safety surrogates when one strong proxy is already available.",
"Keep the reward simple enough that architecture comparison remains credible.", "Keep the reward simple enough that architecture comparison remains credible.",
"Avoid auxiliary regularizers that change the objective across training stages.", "Avoid auxiliary regularizers that change the objective across training stages.",
), ),
@@ -131,7 +131,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
"Control smoothness penalties that distort architecture comparison.", "Control smoothness penalties that distort architecture comparison.",
), ),
implementation_notes=( implementation_notes=(
"The safety term is currently built from normalized positive closing-speed variance and corridor stop rate.", "The safety term is currently built only from corridor stop rate.",
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.", "Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
"Training and evaluate now share the same fixed reward structure.", "Training and evaluate now share the same fixed reward structure.",
), ),

View File

@@ -38,8 +38,7 @@ class RewardConfig:
throughput_weight: float = 0.75 throughput_weight: float = 0.75
safety_weight: float = 0.20 safety_weight: float = 0.20
throughput_ref_vehph: float = 3908.1 throughput_ref_vehph: float = 3908.1
safety_closing_speed_weight: float = 0.6 safety_stop_weight: float = 1.0
safety_stop_weight: float = 0.4
bottleneck_window_size: int = 3 bottleneck_window_size: int = 3
v_limit: float = 33.33 v_limit: float = 33.33
leader_gap_threshold_m: float = 100.0 leader_gap_threshold_m: float = 100.0
@@ -58,8 +57,7 @@ class RewardConfig:
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)), throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
safety_weight=float(raw_cfg.get("safety_weight", 0.20)), safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)), throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
safety_closing_speed_weight=float(raw_cfg.get("safety_closing_speed_weight", 0.6)), safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 0.4)),
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))), bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
v_limit=float(raw_cfg.get("v_limit", 33.33)), v_limit=float(raw_cfg.get("v_limit", 33.33)),
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)), leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
@@ -93,12 +91,8 @@ class RewardCalculator:
throughput = float(info.get("throughput", 0.0)) throughput = float(info.get("throughput", 0.0))
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6)) r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
speed_variance_norm = clip01(float(info.get("speed_variance_norm", 0.0)))
stop_rate = clip01(float(info.get("stop_rate", 0.0))) stop_rate = clip01(float(info.get("stop_rate", 0.0)))
safety_penalty = clip01( safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
self.config.safety_closing_speed_weight * speed_variance_norm
+ self.config.safety_stop_weight * stop_rate
)
r_safety = -safety_penalty r_safety = -safety_penalty
info["r_throughput"] = float(r_throughput) info["r_throughput"] = float(r_throughput)