更新奖励函数
This commit is contained in:
parent
c126e4a9f5
commit
2f594b0eb0
|
|
@ -59,8 +59,7 @@ environment:
|
|||
|
||||
bottleneck_window_size: 3
|
||||
|
||||
safety_closing_speed_weight: 0.6
|
||||
safety_stop_weight: 0.4
|
||||
safety_stop_weight: 1.0
|
||||
|
||||
v_limit: 30.56
|
||||
leader_gap_threshold_m: 100.0
|
||||
|
|
|
|||
|
|
@ -91,16 +91,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
RewardTerm(
|
||||
name="safety",
|
||||
symbol="R_safety",
|
||||
objective="Penalize unsafe speed interactions and excessive stopping with one merged safety term.",
|
||||
objective="Penalize corridor stopping with one compact safety term.",
|
||||
rationale=(
|
||||
"Safety should remain in the reward, but in a compact form. Positive closing-speed "
|
||||
"dispersion and corridor stop rate are already available in the environment and together "
|
||||
"provide a reasonable lightweight safety proxy."
|
||||
"For this experiment, excessive stopping is already a strong proxy for unstable or "
|
||||
"unsafe traffic operation. Using only stop rate keeps the objective simpler and "
|
||||
"reduces interference from overlapping surrogate signals."
|
||||
),
|
||||
formula_tex=(
|
||||
r"R_safety(t)=-clip(w_c \hat{\sigma}_{close}^2(t)+w_s \hat{s}(t),0,1)"
|
||||
r"R_safety(t)=-clip(w_s \hat{s}(t),0,1)"
|
||||
),
|
||||
required_signals=("closing-speed variance", "corridor stop rate", "controlled vehicle count"),
|
||||
required_signals=("corridor stop rate",),
|
||||
),
|
||||
)
|
||||
|
||||
|
|
@ -117,7 +117,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
design_principles=(
|
||||
"Prefer direct result metrics over too many intermediate bottleneck heuristics.",
|
||||
"Keep safety in the reward, but merge it into a single compact term.",
|
||||
"Use the same safety proxies that are already measurable in the environment.",
|
||||
"Avoid overlapping safety surrogates when one strong proxy is already available.",
|
||||
"Keep the reward simple enough that architecture comparison remains credible.",
|
||||
"Avoid auxiliary regularizers that change the objective across training stages.",
|
||||
),
|
||||
|
|
@ -131,7 +131,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
"Control smoothness penalties that distort architecture comparison.",
|
||||
),
|
||||
implementation_notes=(
|
||||
"The safety term is currently built from normalized positive closing-speed variance and corridor stop rate.",
|
||||
"The safety term is currently built only from corridor stop rate.",
|
||||
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
|
||||
"Training and evaluate now share the same fixed reward structure.",
|
||||
),
|
||||
|
|
|
|||
|
|
@ -38,8 +38,7 @@ class RewardConfig:
|
|||
throughput_weight: float = 0.75
|
||||
safety_weight: float = 0.20
|
||||
throughput_ref_vehph: float = 3908.1
|
||||
safety_closing_speed_weight: float = 0.6
|
||||
safety_stop_weight: float = 0.4
|
||||
safety_stop_weight: float = 1.0
|
||||
bottleneck_window_size: int = 3
|
||||
v_limit: float = 33.33
|
||||
leader_gap_threshold_m: float = 100.0
|
||||
|
|
@ -58,8 +57,7 @@ class RewardConfig:
|
|||
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
|
||||
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
||||
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
|
||||
safety_closing_speed_weight=float(raw_cfg.get("safety_closing_speed_weight", 0.6)),
|
||||
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 0.4)),
|
||||
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
|
||||
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
||||
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
||||
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
|
||||
|
|
@ -93,12 +91,8 @@ class RewardCalculator:
|
|||
throughput = float(info.get("throughput", 0.0))
|
||||
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
|
||||
|
||||
speed_variance_norm = clip01(float(info.get("speed_variance_norm", 0.0)))
|
||||
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
||||
safety_penalty = clip01(
|
||||
self.config.safety_closing_speed_weight * speed_variance_norm
|
||||
+ self.config.safety_stop_weight * stop_rate
|
||||
)
|
||||
safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
|
||||
r_safety = -safety_penalty
|
||||
|
||||
info["r_throughput"] = float(r_throughput)
|
||||
|
|
|
|||
Loading…
Reference in New Issue