更新奖励函数
This commit is contained in:
parent
c126e4a9f5
commit
2f594b0eb0
|
|
@ -59,8 +59,7 @@ environment:
|
||||||
|
|
||||||
bottleneck_window_size: 3
|
bottleneck_window_size: 3
|
||||||
|
|
||||||
safety_closing_speed_weight: 0.6
|
safety_stop_weight: 1.0
|
||||||
safety_stop_weight: 0.4
|
|
||||||
|
|
||||||
v_limit: 30.56
|
v_limit: 30.56
|
||||||
leader_gap_threshold_m: 100.0
|
leader_gap_threshold_m: 100.0
|
||||||
|
|
|
||||||
|
|
@ -91,16 +91,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
RewardTerm(
|
RewardTerm(
|
||||||
name="safety",
|
name="safety",
|
||||||
symbol="R_safety",
|
symbol="R_safety",
|
||||||
objective="Penalize unsafe speed interactions and excessive stopping with one merged safety term.",
|
objective="Penalize corridor stopping with one compact safety term.",
|
||||||
rationale=(
|
rationale=(
|
||||||
"Safety should remain in the reward, but in a compact form. Positive closing-speed "
|
"For this experiment, excessive stopping is already a strong proxy for unstable or "
|
||||||
"dispersion and corridor stop rate are already available in the environment and together "
|
"unsafe traffic operation. Using only stop rate keeps the objective simpler and "
|
||||||
"provide a reasonable lightweight safety proxy."
|
"reduces interference from overlapping surrogate signals."
|
||||||
),
|
),
|
||||||
formula_tex=(
|
formula_tex=(
|
||||||
r"R_safety(t)=-clip(w_c \hat{\sigma}_{close}^2(t)+w_s \hat{s}(t),0,1)"
|
r"R_safety(t)=-clip(w_s \hat{s}(t),0,1)"
|
||||||
),
|
),
|
||||||
required_signals=("closing-speed variance", "corridor stop rate", "controlled vehicle count"),
|
required_signals=("corridor stop rate",),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -117,7 +117,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
design_principles=(
|
design_principles=(
|
||||||
"Prefer direct result metrics over too many intermediate bottleneck heuristics.",
|
"Prefer direct result metrics over too many intermediate bottleneck heuristics.",
|
||||||
"Keep safety in the reward, but merge it into a single compact term.",
|
"Keep safety in the reward, but merge it into a single compact term.",
|
||||||
"Use the same safety proxies that are already measurable in the environment.",
|
"Avoid overlapping safety surrogates when one strong proxy is already available.",
|
||||||
"Keep the reward simple enough that architecture comparison remains credible.",
|
"Keep the reward simple enough that architecture comparison remains credible.",
|
||||||
"Avoid auxiliary regularizers that change the objective across training stages.",
|
"Avoid auxiliary regularizers that change the objective across training stages.",
|
||||||
),
|
),
|
||||||
|
|
@ -131,7 +131,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
"Control smoothness penalties that distort architecture comparison.",
|
"Control smoothness penalties that distort architecture comparison.",
|
||||||
),
|
),
|
||||||
implementation_notes=(
|
implementation_notes=(
|
||||||
"The safety term is currently built from normalized positive closing-speed variance and corridor stop rate.",
|
"The safety term is currently built only from corridor stop rate.",
|
||||||
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
|
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
|
||||||
"Training and evaluate now share the same fixed reward structure.",
|
"Training and evaluate now share the same fixed reward structure.",
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -38,8 +38,7 @@ class RewardConfig:
|
||||||
throughput_weight: float = 0.75
|
throughput_weight: float = 0.75
|
||||||
safety_weight: float = 0.20
|
safety_weight: float = 0.20
|
||||||
throughput_ref_vehph: float = 3908.1
|
throughput_ref_vehph: float = 3908.1
|
||||||
safety_closing_speed_weight: float = 0.6
|
safety_stop_weight: float = 1.0
|
||||||
safety_stop_weight: float = 0.4
|
|
||||||
bottleneck_window_size: int = 3
|
bottleneck_window_size: int = 3
|
||||||
v_limit: float = 33.33
|
v_limit: float = 33.33
|
||||||
leader_gap_threshold_m: float = 100.0
|
leader_gap_threshold_m: float = 100.0
|
||||||
|
|
@ -58,8 +57,7 @@ class RewardConfig:
|
||||||
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
|
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
|
||||||
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
||||||
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
|
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
|
||||||
safety_closing_speed_weight=float(raw_cfg.get("safety_closing_speed_weight", 0.6)),
|
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
|
||||||
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 0.4)),
|
|
||||||
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
||||||
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
||||||
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
|
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
|
||||||
|
|
@ -93,12 +91,8 @@ class RewardCalculator:
|
||||||
throughput = float(info.get("throughput", 0.0))
|
throughput = float(info.get("throughput", 0.0))
|
||||||
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
|
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
|
||||||
|
|
||||||
speed_variance_norm = clip01(float(info.get("speed_variance_norm", 0.0)))
|
|
||||||
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
||||||
safety_penalty = clip01(
|
safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
|
||||||
self.config.safety_closing_speed_weight * speed_variance_norm
|
|
||||||
+ self.config.safety_stop_weight * stop_rate
|
|
||||||
)
|
|
||||||
r_safety = -safety_penalty
|
r_safety = -safety_penalty
|
||||||
|
|
||||||
info["r_throughput"] = float(r_throughput)
|
info["r_throughput"] = float(r_throughput)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue