新奖励函数:取消通行量指标,改为通行效率指标
This commit is contained in:
parent
3d1782c348
commit
05ef01d93f
|
|
@ -53,9 +53,8 @@ environment:
|
||||||
|
|
||||||
reward:
|
reward:
|
||||||
reward_scale: 10.0
|
reward_scale: 10.0
|
||||||
throughput_weight: 0.75
|
efficiency_weight: 0.75
|
||||||
safety_weight: 0.20
|
safety_weight: 0.20
|
||||||
throughput_ref_vehph: 3908.1
|
|
||||||
|
|
||||||
bottleneck_window_size: 3
|
bottleneck_window_size: 3
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -78,15 +78,16 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
|
|
||||||
terms = (
|
terms = (
|
||||||
RewardTerm(
|
RewardTerm(
|
||||||
name="throughput",
|
name="efficiency",
|
||||||
symbol="R_throughput",
|
symbol="R_efficiency",
|
||||||
objective="Maximize realized corridor throughput in each control interval.",
|
objective="Maintain a high average running speed in the controlled corridor.",
|
||||||
rationale=(
|
rationale=(
|
||||||
"If the experiment focuses on final operational effectiveness, a direct throughput term "
|
"Instantaneous corridor outflow fluctuates strongly with simulation time and is noisy as a "
|
||||||
"is simpler and more result-oriented than several intermediate bottleneck-state surrogates."
|
"step reward. A normalized mean-speed term is smoother, reacts more directly to VSL control, "
|
||||||
|
"and is easier for policy optimization to fit."
|
||||||
),
|
),
|
||||||
formula_tex=r"R_throughput(t)=clip(q_tp(t)/q_ref,0,1)",
|
formula_tex=r"R_efficiency(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}",
|
||||||
required_signals=("interval throughput", "throughput reference"),
|
required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
|
||||||
),
|
),
|
||||||
RewardTerm(
|
RewardTerm(
|
||||||
name="safety",
|
name="safety",
|
||||||
|
|
@ -105,17 +106,18 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
)
|
)
|
||||||
|
|
||||||
return RewardBlueprint(
|
return RewardBlueprint(
|
||||||
name="Minimal Throughput-Safety Reward Blueprint For TCA-MAPPO",
|
name="Minimal Efficiency-Safety Reward Blueprint For TCA-MAPPO",
|
||||||
scenario_summary=(
|
scenario_summary=(
|
||||||
"The study controls a segmented freeway VSL corridor under fixed control intervals. "
|
"The study controls a segmented freeway VSL corridor under fixed control intervals. "
|
||||||
"For architecture comparison, the reward should emphasize a small number of stable, "
|
"For architecture comparison, the reward should emphasize a small number of stable, "
|
||||||
"interpretable traffic goals."
|
"interpretable traffic goals."
|
||||||
),
|
),
|
||||||
primary_objective=(
|
primary_objective=(
|
||||||
"Increase realized throughput and reduce unsafe interactions with a compact reward."
|
"Improve corridor running efficiency and reduce unsafe interactions with a compact reward."
|
||||||
),
|
),
|
||||||
design_principles=(
|
design_principles=(
|
||||||
"Prefer direct result metrics over too many intermediate bottleneck heuristics.",
|
"Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
|
||||||
|
"Prefer direct operational efficiency metrics over too many intermediate bottleneck heuristics.",
|
||||||
"Keep safety in the reward, but merge it into a single compact term.",
|
"Keep safety in the reward, but merge it into a single compact term.",
|
||||||
"Avoid overlapping safety surrogates when one strong proxy is already available.",
|
"Avoid overlapping safety surrogates when one strong proxy is already available.",
|
||||||
"Keep the reward simple enough that architecture comparison remains credible.",
|
"Keep the reward simple enough that architecture comparison remains credible.",
|
||||||
|
|
@ -123,16 +125,17 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
),
|
),
|
||||||
terms=terms,
|
terms=terms,
|
||||||
global_formula_tex=(
|
global_formula_tex=(
|
||||||
r"R(t)=kappa [lambda_tp R_throughput(t)+lambda_safe R_safety(t)]"
|
r"R(t)=\kappa [\lambda_{\mathrm{eff}} R_{\mathrm{efficiency}}(t)+\lambda_{\mathrm{safe}} R_{\mathrm{safety}}(t)]"
|
||||||
),
|
),
|
||||||
excluded_metrics=(
|
excluded_metrics=(
|
||||||
"Separate bottleneck tracking and oversaturation terms when throughput already captures outcome.",
|
"Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
|
||||||
"Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
|
"Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
|
||||||
"Control smoothness penalties that distort architecture comparison.",
|
"Control smoothness penalties that distort architecture comparison.",
|
||||||
),
|
),
|
||||||
implementation_notes=(
|
implementation_notes=(
|
||||||
|
"The efficiency term is currently built from normalized controlled-corridor mean speed.",
|
||||||
"The safety term is currently built only from corridor stop rate.",
|
"The safety term is currently built only from corridor stop rate.",
|
||||||
"Bottleneck occupancy can still be logged for diagnosis even if it is no longer part of the reward.",
|
"Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
|
||||||
"Training and evaluate now share the same fixed reward structure.",
|
"Training and evaluate now share the same fixed reward structure.",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,12 @@ import numpy as np
|
||||||
|
|
||||||
|
|
||||||
REWARD_COMPONENT_COLUMNS = (
|
REWARD_COMPONENT_COLUMNS = (
|
||||||
"r_throughput",
|
"r_efficiency",
|
||||||
"r_safety",
|
"r_safety",
|
||||||
)
|
)
|
||||||
|
|
||||||
REWARD_COMPONENT_LABELS = {
|
REWARD_COMPONENT_LABELS = {
|
||||||
"r_throughput": "R_throughput",
|
"r_efficiency": "R_efficiency",
|
||||||
"r_safety": "R_safety",
|
"r_safety": "R_safety",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -35,9 +35,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class RewardConfig:
|
class RewardConfig:
|
||||||
reward_scale: float = 10.0
|
reward_scale: float = 10.0
|
||||||
throughput_weight: float = 0.75
|
efficiency_weight: float = 0.75
|
||||||
safety_weight: float = 0.20
|
safety_weight: float = 0.20
|
||||||
throughput_ref_vehph: float = 3908.1
|
|
||||||
safety_stop_weight: float = 1.0
|
safety_stop_weight: float = 1.0
|
||||||
bottleneck_window_size: int = 3
|
bottleneck_window_size: int = 3
|
||||||
v_limit: float = 33.33
|
v_limit: float = 33.33
|
||||||
|
|
@ -54,9 +53,8 @@ class RewardConfig:
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
|
reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
|
||||||
throughput_weight=float(raw_cfg.get("throughput_weight", 0.75)),
|
efficiency_weight=float(raw_cfg.get("efficiency_weight", 0.75)),
|
||||||
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
safety_weight=float(raw_cfg.get("safety_weight", 0.20)),
|
||||||
throughput_ref_vehph=float(raw_cfg.get("throughput_ref_vehph", 3908.1)),
|
|
||||||
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
|
safety_stop_weight=float(raw_cfg.get("safety_stop_weight", 1.0)),
|
||||||
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
|
||||||
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
v_limit=float(raw_cfg.get("v_limit", 33.33)),
|
||||||
|
|
@ -88,19 +86,20 @@ class RewardCalculator:
|
||||||
) -> float:
|
) -> float:
|
||||||
_ = current_edge_speeds, prev_edge_speeds, episode_index
|
_ = current_edge_speeds, prev_edge_speeds, episode_index
|
||||||
|
|
||||||
throughput = float(info.get("throughput", 0.0))
|
mean_speed = max(float(info.get("mean_speed", 0.0)), 0.0)
|
||||||
r_throughput = clip01(throughput / max(self.config.throughput_ref_vehph, 1e-6))
|
num_vehicles = max(int(info.get("num_vehicles", 0)), 0)
|
||||||
|
r_efficiency = clip01(mean_speed / max(self.config.v_limit, 1e-6)) if num_vehicles > 0 else 0.0
|
||||||
|
|
||||||
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
stop_rate = clip01(float(info.get("stop_rate", 0.0)))
|
||||||
safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
|
safety_penalty = clip01(self.config.safety_stop_weight * stop_rate)
|
||||||
r_safety = -safety_penalty
|
r_safety = -safety_penalty
|
||||||
|
|
||||||
info["r_throughput"] = float(r_throughput)
|
info["r_efficiency"] = float(r_efficiency)
|
||||||
info["r_safety"] = float(r_safety)
|
info["r_safety"] = float(r_safety)
|
||||||
info["safety_penalty_norm"] = float(safety_penalty)
|
info["safety_penalty_norm"] = float(safety_penalty)
|
||||||
|
|
||||||
reward = (
|
reward = (
|
||||||
self.config.throughput_weight * r_throughput
|
self.config.efficiency_weight * r_efficiency
|
||||||
+ self.config.safety_weight * r_safety
|
+ self.config.safety_weight * r_safety
|
||||||
)
|
)
|
||||||
return float(reward * self.config.reward_scale)
|
return float(reward * self.config.reward_scale)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue