新奖励函数,取消通行量指标改为通行效率
This commit is contained in:
parent
3d1782c348
commit
05ef01d93f
|
|
@@ -53,9 +53,8 @@ environment:
|
|||
|
||||
reward:
|
||||
reward_scale: 10.0
|
||||
throughput_weight: 0.75
|
||||
efficiency_weight: 0.75
|
||||
safety_weight: 0.20
|
||||
throughput_ref_vehph: 3908.1
|
||||
|
||||
bottleneck_window_size: 3
|
||||
|
||||
|
|
|
|||
|
|
@@ -78,15 +78,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
|
||||
terms = (
|
||||
RewardTerm(
|
||||
name="throughput",
|
||||
symbol="R_throughput",
|
||||
objective="Maximize realized corridor throughput in each control interval.",
|
||||
name="efficiency",
|
||||
symbol="R_efficiency",
|
||||
objective="Maintain a high average running speed in the controlled corridor.",
|
||||
rationale=(
|
||||
"If the experiment focuses on final operational effectiveness, a direct throughput term "
|
||||
"is simpler and more result-oriented than several intermediate bottleneck-state surrogates."
|
||||
"Instantaneous corridor outflow fluctuates strongly with simulation time and is noisy as a "
|
||||
"step reward. A normalized mean-speed term is smoother, reacts more directly to VSL control, "
|
||||
"and is easier for policy optimization to fit."
|
||||
),
|
||||
formula_tex=r"R_throughput(t)=clip(q_tp(t)/q_ref,0,1)",
|
||||
required_signals=("interval throughput", "throughput reference"),
|
||||
formula_tex=r"R_efficiency(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}",
|
||||
required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
|
||||
),
|
||||
RewardTerm(
|
||||
name="safety",
|
||||
|
|
@@ -105,17 +106,18 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
)
|
||||
|
||||
return RewardBlueprint(
|
||||
name="Minimal Throughput-Safety Reward Blueprint For TCA-MAPPO",
|
||||
name="Minimal Efficiency-Safety Reward Blueprint For TCA-MAPPO",
|
||||
scenario_summary=(
|
||||
"The study controls a segmented freeway VSL corridor under fixed control intervals. "
|
||||
"For architecture comparison, the reward should emphasize a small number of stable, "
|
||||
"interpretable traffic goals."
|
||||
),
|
||||
primary_objective=(
|
||||
"Increase realized throughput and reduce unsafe interactions with a compact reward."
|
||||
"Improve corridor running efficiency and reduce unsafe interactions with a compact reward."
|
||||
),
|
||||
design_principles=(
|
||||
"Prefer direct result metrics over too many intermediate bottleneck heuristics.",
|
||||
"Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
|
||||
"Prefer direct operational efficiency metrics over too many intermediate bottleneck heuristics.",
|
||||
"Keep safety in the reward, but merge it into a single compact term.",
|
||||
"Avoid overlapping safety surrogates when one strong proxy is already available.",
|
||||
"Keep the reward simple enough that architecture comparison remains credible.",
|
||||
|
|
@@ -123,16 +125,17 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
|||
),
|
||||
terms=terms,
|
||||
global_formula_tex=(
|
||||
r"R(t)=kappa [lambda_tp R_throughput(t)+lambda_safe R_safety(t)]"
|
||||
r"R(t)=\kappa [\lambda_{\mathrm{eff}} R_{\mathrm{efficiency}}(t)+\lambda_{\mathrm{safe}} R_{\mathrm{safety}}(t)]"
|
||||
),
|
||||
excluded_metrics=(
|
||||
"Separate bottleneck tracking and oversaturation terms when throughput already captures outcome.",
|
||||
"Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
|
||||
"Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
|
||||
"Control smoothness penalties that distort architecture comparison.",
|
||||
),
|
||||
implementation_notes=(
|
||||
"The efficiency term is currently built from normalized controlled-corridor mean speed.",
|
||||
"The safety term is currently built only from corridor stop rate.",
|
||||
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
|
||||
"Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
|
||||
"Training and evaluate now share the same fixed reward structure.",
|
||||
),
|
||||
)
|
||||
|
|
|
|||
|
|
@@ -9,12 +9,12 @@ import numpy as np
|
|||
|
||||
|
||||
REWARD_COMPONENT_COLUMNS = (
|
||||
"r_throughput",
|
||||
"r_efficiency",
|
||||
"r_safety",
|
||||
)
|
||||
|
||||
REWARD_COMPONENT_LABELS = {
|
||||
"r_throughput": "R_throughput",
|
||||
"r_efficiency": "R_efficiency",
|
||||
"r_safety": "R_safety",
|
||||
}
|
||||
|
||||
|
|
@@ -35,9 +35,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
|
|||
@dataclass(frozen=True)
|
||||
class RewardConfig:
|
||||
reward_scale: float = 10.0
|
||||
throughput_weight: float = 0.75
|
||||
efficiency_weight: float = 0.75
|
||||
safety_weight: float = 0.20
|
||||
throughput_ref_vehph: float = 3908.1
|
||||
safety_stop_weight: float = 1.0
|
||||
bottleneck_window_size: int = 3
|
||||
v_limit: float = 33.33
|
||||
|
|
@@ -54,9 +53,8 @@ class RewardConfig:
|
|||
|
||||
return cls(
|
||||
reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
|
||||
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
|
||||
efficiency_weight=float(raw_cfg.get("efficiency_weight", 0.75)),
|
||||
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
||||
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
|
||||
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
|
||||
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
||||
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
||||
|
|
@@ -88,19 +86,20 @@ class RewardCalculator:
|
|||
) -> float:
|
||||
_ = current_edge_speeds, prev_edge_speeds, episode_index
|
||||
|
||||
throughput = float(info.get("throughput", 0.0))
|
||||
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
|
||||
mean_speed = max(float(info.get("mean_speed", 0.0)), 0.0)
|
||||
num_vehicles = max(int(info.get("num_vehicles", 0)), 0)
|
||||
r_efficiency = clip01(mean_speed / max(self.config.v_limit, 1e-6)) if num_vehicles > 0 else 0.0
|
||||
|
||||
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
||||
safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
|
||||
r_safety = -safety_penalty
|
||||
|
||||
info["r_throughput"] = float(r_throughput)
|
||||
info["r_efficiency"] = float(r_efficiency)
|
||||
info["r_safety"] = float(r_safety)
|
||||
info["safety_penalty_norm"] = float(safety_penalty)
|
||||
|
||||
reward = (
|
||||
self.config.throughput_weight * r_throughput
|
||||
self.config.efficiency_weight * r_efficiency
|
||||
+ self.config.safety_weight * r_safety
|
||||
)
|
||||
return float(reward * self.config.reward_scale)
|
||||
|
|
|
|||
Loading…
Reference in New Issue