尝试基于模拟运行的奖励

This commit is contained in:
Zihan Ye 2026-04-25 08:23:15 +08:00
parent 45e6de22c1
commit 7becebfe6b
19 changed files with 717 additions and 374 deletions

View File

@ -63,15 +63,17 @@ environment:
free_flow_speed: 30.56
reward:
efficiency_alpha: 2.1159
safety_beta: 7.8553
efficiency_exponent: 0.50
safety_exponent: 0.50
mode: "paired_no_control"
baseline_dir: ""
baseline_key: "step"
baseline_wait_timeout_s: 3600.0
baseline_poll_interval_s: 1.0
travel_time_weight: 0.5
ttc_weight: 0.5
travel_time_min_denominator_s: 60.0
ttc_threshold_s: 2.3
bottleneck_window_size: 3
v_limit: 30.56
leader_gap_threshold_m: 100.0
training:

View File

@ -21,6 +21,7 @@ import traci.constants as tc
from envs.network_parser import SUMONetworkParser
from envs.reward_system import RewardCalculator, RewardConfig
from utils.experiment_corridor import prepare_experiment_corridor_assets
from utils.reward_baseline import resolve_baseline_dir, wait_for_episode_baseline
class SUMOEdgeVSLEnvironment:
@ -41,6 +42,7 @@ class SUMOEdgeVSLEnvironment:
runtime_cfg = config.get("runtime", {})
self.runtime_output_dir = runtime_cfg.get("output_dir")
self.runtime_metrics_subdir = runtime_cfg.get("metrics_subdir", "sumo_metrics")
self.run_timestamp = runtime_cfg.get("run_timestamp")
self.evaluation_mode = bool(runtime_cfg.get("evaluation_mode", False))
self.runtime_detector_add_file: Optional[str] = None
self.runtime_enex_add_file: Optional[str] = None
@ -163,6 +165,8 @@ class SUMOEdgeVSLEnvironment:
controlled_edge_start_index=self.controlled_edge_start_index,
evaluation_mode=self.evaluation_mode,
)
self.reward_baseline_dir = resolve_baseline_dir(config, self.run_timestamp)
self.reward_calculator.set_step_baseline({})
self.action_dims = [self.num_speed_actions] * self.num_controlled_edges
self.features_per_edge = 3
@ -182,6 +186,8 @@ class SUMOEdgeVSLEnvironment:
self._interval_mainline_travel_times: List[float] = []
self._episode_rng = np.random.default_rng()
self._incident_state: Dict[str, object] = {}
self._reward_baseline_episode = 0
self._reward_baseline_loaded_step = 0
print("SUMO Edge VSL Environment initialized")
print(f" Control segments: {self.num_edges}")
@ -498,6 +504,38 @@ class SUMOEdgeVSLEnvironment:
)
info["incident_lane_index"] = int(incident.get("lane_index", -1))
def _load_episode_reward_baseline(self, episode: int) -> Dict[int, Dict[str, float]]:
mode = str(self.reward_config.mode).lower()
if mode in {"paired_no_control", "episode_baseline"}:
return wait_for_episode_baseline(
baseline_dir=self.reward_baseline_dir,
episode=episode,
min_step=self._reward_baseline_loaded_step + 1,
timeout_s=self.reward_config.baseline_wait_timeout_s,
poll_interval_s=self.reward_config.baseline_poll_interval_s,
)
return {}
def _sync_episode_reward_baseline(self, min_step: int) -> None:
mode = str(self.reward_config.mode).lower()
if mode not in {"paired_no_control", "episode_baseline"}:
return
if self._reward_baseline_episode != self._episode_count:
self._reward_baseline_episode = self._episode_count
self._reward_baseline_loaded_step = 0
self.reward_calculator.set_step_baseline({})
if self._reward_baseline_loaded_step >= min_step:
return
baseline = wait_for_episode_baseline(
baseline_dir=self.reward_baseline_dir,
episode=self._episode_count,
min_step=min_step,
timeout_s=self.reward_config.baseline_wait_timeout_s,
poll_interval_s=self.reward_config.baseline_poll_interval_s,
)
self.reward_calculator.set_step_baseline(baseline)
self._reward_baseline_loaded_step = max(baseline) if baseline else 0
def _start_sumo(self, seed: Optional[int] = None):
if self._sumo_running:
self._close_sumo()
@ -617,6 +655,13 @@ class SUMOEdgeVSLEnvironment:
self._completed_mainline_travel_times = []
self._interval_mainline_travel_times = []
self._reset_incident_runtime(seed)
mode = str(self.reward_config.mode).lower()
if mode in {"paired_no_control", "episode_baseline"}:
self._reward_baseline_episode = self._episode_count
self._reward_baseline_loaded_step = 0
self.reward_calculator.set_step_baseline({})
else:
self.reward_calculator.set_step_baseline(self._load_episode_reward_baseline(self._episode_count))
self._start_sumo(seed=seed)
warmup_steps = int(self.warmup_time / self.control_interval)
@ -659,6 +704,8 @@ class SUMOEdgeVSLEnvironment:
detector_data = self._get_edge_detector_data()
state = self._collect_state(detector_data)
info = self._collect_runtime_metrics(detector_data)
info["step"] = self.current_step + 1
self._sync_episode_reward_baseline(int(info["step"]))
info["detector_cells"] = self._collect_all_detector_cells() if self.collect_detector_cells else []
reward = self._calculate_reward(info)
self._last_reward = reward
@ -1040,7 +1087,7 @@ class SUMOEdgeVSLEnvironment:
info["relative_speed_samples"] = relative_speed_samples
info["relative_speed_variance"] = float(np.var(relative_speed_samples)) if relative_speed_samples else 0.0
info["speed_variance_norm"] = (
info["relative_speed_variance"] / max(self.reward_config.v_limit ** 2, 1e-6)
info["relative_speed_variance"] / max(self.free_flow_speed ** 2, 1e-6)
)
info["ttc_samples"] = ttc_samples
positive_ttc_samples = [value for value in ttc_samples if value > 0.0]

View File

@ -1,168 +0,0 @@
"""Minimal reward blueprint for the current freeway VSL study."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Iterable, Tuple
@dataclass(frozen=True)
class RewardTerm:
name: str
symbol: str
objective: str
rationale: str
formula_tex: str
required_signals: Tuple[str, ...]
def to_markdown(self) -> str:
return "\n".join(
[
f"### {self.name}",
f"- Symbol: `{self.symbol}`",
f"- Objective: {self.objective}",
f"- Rationale: {self.rationale}",
f"- Formula: `{self.formula_tex}`",
"- Required signals: " + ", ".join(f"`{value}`" for value in self.required_signals),
]
)
@dataclass(frozen=True)
class RewardBlueprint:
name: str
scenario_summary: str
primary_objective: str
design_principles: Tuple[str, ...]
terms: Tuple[RewardTerm, ...]
global_formula_tex: str
excluded_metrics: Tuple[str, ...] = ()
implementation_notes: Tuple[str, ...] = ()
def term_names(self) -> Tuple[str, ...]:
return tuple(term.name for term in self.terms)
def to_markdown(self) -> str:
lines = [
f"# {self.name}",
"",
"## Scenario",
self.scenario_summary,
"",
"## Primary Objective",
self.primary_objective,
"",
"## Global Formula",
f"`{self.global_formula_tex}`",
"",
"## Design Principles",
]
lines.extend(f"- {item}" for item in self.design_principles)
lines.append("")
lines.append("## Reward Terms")
for term in self.terms:
lines.append(term.to_markdown())
lines.append("")
if self.excluded_metrics:
lines.append("## Metrics To Avoid As Primary Reward Drivers")
lines.extend(f"- {item}" for item in self.excluded_metrics)
lines.append("")
if self.implementation_notes:
lines.append("## Implementation Notes")
lines.extend(f"- {item}" for item in self.implementation_notes)
return "\n".join(lines).rstrip() + "\n"
def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
"""Build the current multiplicative-MAUT reward blueprint for corridor VSL."""
terms = (
RewardTerm(
name="efficiency",
symbol="R_efficiency",
objective="Map normalized corridor running efficiency into a bounded utility term.",
rationale=(
"The raw efficiency indicator should not enter the reward linearly. A monotone concave utility "
"function emphasizes recovery from low-efficiency states while avoiding over-rewarding already "
"high-speed regimes."
),
formula_tex=(
r"R_{\mathrm{efficiency}}(t)=1-\exp\!\left(-\alpha E(t)\right),\ "
r"E(t)=\begin{cases}\mathrm{clip}(\bar{v}(t)/v_{\max},0,1),&N(t)>0\\0,&N(t)=0\end{cases}"
),
required_signals=("controlled-corridor mean speed", "controlled-corridor active vehicle count", "speed limit"),
),
RewardTerm(
name="safety",
symbol="R_safety",
objective="Map TTC-based following risk into a bounded safety utility term.",
rationale=(
"For freeway VSL, rear-end conflict risk is better captured by time-to-collision than by "
"stop counting. An exponential safety utility makes the reward sensitive to the spread of "
"short-TTC following states while preserving boundedness and interpretability."
),
formula_tex=(
r"R_{\mathrm{safety}}(t)=\exp\!\left(-\beta S(t)\right),\ "
r"S(t)=\frac{1}{N(t)}\sum_{i=1}^{N(t)}\max\!\left(0,1-\frac{\mathrm{TTC}_i(t)}{\tau_{\mathrm{ttc}}}\right)"
),
required_signals=(
"controlled-corridor vehicle speeds",
"leader speeds",
"leader gaps",
"TTC threshold",
),
),
)
return RewardBlueprint(
name="Multiplicative MAUT Reward Blueprint For TCA-MAPPO",
scenario_summary=(
"The study controls a segmented freeway VSL corridor under fixed control intervals. "
"For architecture comparison, the reward should emphasize a small number of stable, "
"interpretable traffic goals while avoiding overly simplistic linear compensation between "
"efficiency and safety."
),
primary_objective=(
"Improve corridor running efficiency and traffic safety through a nonlinear utility-based reward."
),
design_principles=(
"Prefer stable per-step traffic signals over highly time-dependent outflow spikes.",
"Map raw traffic indicators into utility space before aggregation.",
"Avoid full linear substitutability between efficiency and safety.",
"Use a compact nonlinear aggregation that remains easy to interpret and implement.",
"Use a freeway-native rear-end risk surrogate instead of coarse stop counting.",
"Avoid auxiliary regularizers that change the objective across training stages.",
),
terms=terms,
global_formula_tex=(
r"R(t)=R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}"
r"R_{\mathrm{safety}}(t)^{\lambda_{\mathrm{safe}}}"
),
excluded_metrics=(
"Instantaneous outflow as the primary step reward because it is strongly modulated by simulation time.",
"Too many correlated penalties such as shockwave, occupancy, braking, and variance all entering together.",
"Purely linear reward aggregation when safety should not be fully compensated by efficiency gains.",
),
implementation_notes=(
"The efficiency utility is built from normalized controlled-corridor mean speed.",
"The safety utility is built from a TTC-thresholded average following-risk indicator.",
"The exponents are normalized internally before multiplicative aggregation.",
"Throughput and bottleneck occupancy can still be logged for diagnosis even if they are no longer part of the reward.",
"Training and evaluate now share the same fixed reward structure.",
),
)
def build_reward_blueprint_markdown() -> str:
return build_tca_mappo_reward_blueprint().to_markdown()
def iter_required_signals() -> Iterable[str]:
signals = set()
for term in build_tca_mappo_reward_blueprint().terms:
signals.update(term.required_signals)
return tuple(sorted(signals))
if __name__ == "__main__":
print(build_reward_blueprint_markdown())

View File

@ -1,4 +1,4 @@
"""Shared reward configuration and calculation for freeway VSL environments."""
"""Shared reward calculation for freeway VSL environments."""
from __future__ import annotations
@ -9,22 +9,18 @@ import numpy as np
REWARD_COMPONENT_COLUMNS = (
"r_efficiency",
"r_safety",
"r_utility",
"r_travel_time_improvement",
"r_ttc_improvement",
"r_improvement",
)
REWARD_COMPONENT_LABELS = {
"r_efficiency": "R_efficiency",
"r_safety": "R_safety",
"r_utility": "R_utility",
"r_travel_time_improvement": "R_travel_time",
"r_ttc_improvement": "R_ttc",
"r_improvement": "R_total",
}
def clip01(value: float) -> float:
return float(np.clip(value, 0.0, 1.0))
def init_reward_component_totals() -> Dict[str, float]:
return {column: 0.0 for column in REWARD_COMPONENT_COLUMNS}
@ -36,14 +32,17 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
@dataclass(frozen=True)
class RewardConfig:
efficiency_alpha: float = 2.1159
safety_beta: float = 7.8553
efficiency_exponent: float = 0.50
safety_exponent: float = 0.50
ttc_threshold_s: float = 2.3
bottleneck_window_size: int = 3
v_limit: float = 33.33
leader_gap_threshold_m: float = 100.0
mode: str = "paired_no_control"
baseline_dir: str = ""
baseline_key: str = "step"
baseline_wait_timeout_s: float = 3600.0
baseline_poll_interval_s: float = 1.0
travel_time_weight: float = 0.5
ttc_weight: float = 0.5
travel_time_min_denominator_s: float = 60.0
@classmethod
def from_dict(
@ -53,21 +52,26 @@ class RewardConfig:
speed_actions_ms: Sequence[float],
) -> "RewardConfig":
_ = speed_actions_ms
return cls(
efficiency_alpha=float(raw_cfg.get("efficiency_alpha", 2.1159)),
safety_beta=float(raw_cfg.get("safety_beta", 7.8553)),
efficiency_exponent=float(raw_cfg.get("efficiency_exponent", 0.50)),
safety_exponent=float(raw_cfg.get("safety_exponent", 0.50)),
ttc_threshold_s=float(raw_cfg.get("ttc_threshold_s", 2.3)),
bottleneck_window_size=max(1, int(raw_cfg.get("bottleneck_window_size", 3))),
v_limit=float(raw_cfg.get("v_limit", 33.33)),
leader_gap_threshold_m=float(raw_cfg.get("leader_gap_threshold_m", 100.0)),
mode=str(raw_cfg.get("mode", "paired_no_control")).strip().lower(),
baseline_dir=str(raw_cfg.get("baseline_dir", "") or ""),
baseline_key=str(raw_cfg.get("baseline_key", "step")).strip().lower(),
baseline_wait_timeout_s=float(raw_cfg.get("baseline_wait_timeout_s", 3600.0)),
baseline_poll_interval_s=float(raw_cfg.get("baseline_poll_interval_s", 1.0)),
travel_time_weight=float(raw_cfg.get("travel_time_weight", 0.5)),
ttc_weight=float(raw_cfg.get("ttc_weight", 0.5)),
travel_time_min_denominator_s=max(
float(raw_cfg.get("travel_time_min_denominator_s", 60.0)),
1e-6,
),
)
class RewardCalculator:
"""Encapsulates a multiplicative MAUT reward for freeway VSL control."""
"""Counterfactual reward relative to synchronized no-control episodes."""
def __init__(
self,
@ -79,14 +83,10 @@ class RewardCalculator:
self.config = config
self.controlled_edge_start_index = int(controlled_edge_start_index)
self.evaluation_mode = bool(evaluation_mode)
self.baseline_by_step: Dict[int, Mapping[str, float]] = {}
def _normalized_preference_exponents(self) -> tuple[float, float]:
eff = max(float(self.config.efficiency_exponent), 0.0)
safe = max(float(self.config.safety_exponent), 0.0)
total = eff + safe
if total <= 1e-8:
return 0.5, 0.5
return eff / total, safe / total
def set_step_baseline(self, baseline_by_step: Mapping[int, Mapping[str, float]]) -> None:
self.baseline_by_step = dict(baseline_by_step or {})
def calculate(
self,
@ -98,26 +98,86 @@ class RewardCalculator:
) -> float:
_ = current_edge_speeds, prev_edge_speeds, episode_index
mean_speed = max(float(info.get("mean_speed", 0.0)), 0.0)
num_vehicles = max(int(info.get("num_vehicles", 0)), 0)
efficiency_norm = clip01(mean_speed / max(self.config.v_limit, 1e-6)) if num_vehicles > 0 else 0.0
r_efficiency = 1.0 - float(np.exp(-max(self.config.efficiency_alpha, 0.0) * efficiency_norm))
ttc_risk_rate = clip01(float(info.get("ttc_risk_rate", 0.0)))
safety_risk = ttc_risk_rate
r_safety = float(np.exp(-max(self.config.safety_beta, 0.0) * safety_risk))
lambda_eff, lambda_safe = self._normalized_preference_exponents()
r_utility = float(
np.power(max(r_efficiency, 1e-8), lambda_eff)
* np.power(max(r_safety, 1e-8), lambda_safe)
current_ttc_risk = _clip(float(info.get("ttc_risk_rate", 0.0)), 0.0, 1.0)
current_travel_time = float(
info.get("mainline_travel_time_cumulative_mean_s", np.nan)
)
info["r_efficiency"] = float(r_efficiency)
info["r_safety"] = float(r_safety)
info["r_utility"] = float(r_utility)
info["efficiency_norm"] = float(efficiency_norm)
info["safety_risk_norm"] = float(safety_risk)
baseline = self._get_baseline(info)
baseline_ttc_risk = _safe_baseline_float(baseline, "ttc_risk_rate")
baseline_travel_time = _safe_baseline_float(
baseline,
"mainline_travel_time_cumulative_mean_s",
)
travel_time_improvement = self._travel_time_improvement(
current_travel_time=current_travel_time,
baseline_travel_time=baseline_travel_time,
)
ttc_improvement = self._ttc_improvement(
current_ttc_risk=current_ttc_risk,
baseline_ttc_risk=baseline_ttc_risk,
)
reward = float(
self.config.travel_time_weight * travel_time_improvement
+ self.config.ttc_weight * ttc_improvement
)
info["reward_mode"] = self.config.mode
info["safety_risk_norm"] = float(current_ttc_risk)
info["ttc_threshold_s"] = float(self.config.ttc_threshold_s)
info["efficiency_lambda"] = float(lambda_eff)
info["safety_lambda"] = float(lambda_safe)
return float(r_utility)
info["baseline_mainline_travel_time_cumulative_mean_s"] = float(baseline_travel_time)
info["baseline_ttc_risk_rate"] = float(baseline_ttc_risk)
info["travel_time_relative_denominator_s"] = float(
self._travel_time_denominator(baseline_travel_time)
)
info["r_travel_time_improvement"] = float(travel_time_improvement)
info["r_ttc_improvement"] = float(ttc_improvement)
info["r_improvement"] = float(reward)
if self.config.mode in {"paired_no_control", "episode_baseline"}:
return reward
return 0.0
def _get_baseline(self, info: Mapping[str, object]) -> Mapping[str, float] | None:
lookup_key = "sim_time" if self.config.baseline_key == "sim_time" else "step"
try:
key_value = int(round(float(info.get(lookup_key, 0.0))))
except (TypeError, ValueError):
key_value = 0
return self.baseline_by_step.get(key_value)
def _travel_time_denominator(self, baseline_travel_time: float) -> float:
if not np.isfinite(baseline_travel_time):
return float("nan")
return max(float(baseline_travel_time), self.config.travel_time_min_denominator_s)
def _travel_time_improvement(
self,
*,
current_travel_time: float,
baseline_travel_time: float,
) -> float:
denominator = self._travel_time_denominator(baseline_travel_time)
if not np.isfinite(current_travel_time) or not np.isfinite(denominator):
return 0.0
return _clip((baseline_travel_time - current_travel_time) / denominator, -1.0, 1.0)
@staticmethod
def _ttc_improvement(*, current_ttc_risk: float, baseline_ttc_risk: float) -> float:
if not np.isfinite(baseline_ttc_risk):
return 0.0
return _clip(baseline_ttc_risk - current_ttc_risk, -1.0, 1.0)
def _safe_baseline_float(baseline: Mapping[str, float] | None, key: str) -> float:
if baseline is None:
return float("nan")
try:
return float(baseline.get(key, np.nan))
except (TypeError, ValueError):
return float("nan")
def _clip(value: float, lower: float, upper: float) -> float:
return float(np.clip(value, lower, upper))

View File

@ -130,7 +130,9 @@ def print_failure_excerpt(name, log_path, returncode):
if tail_lines:
print(" Last log lines:")
for line in tail_lines:
print(f" {line}")
encoding = sys.stdout.encoding or "utf-8"
safe_line = line.encode(encoding, errors="replace").decode(encoding, errors="replace")
print(f" {safe_line}")
else:
print(" No readable log content found.")

View File

@ -57,8 +57,8 @@ MODEL_COLORS = {
"td3": "#17becf",
"sctd3": "#bcbd22",
}
EFFICIENCY_COLUMN = "r_efficiency"
EFFICIENCY_LABEL = REWARD_COMPONENT_LABELS.get(EFFICIENCY_COLUMN, "Running Efficiency")
EFFICIENCY_COLUMN = "r_travel_time_improvement"
EFFICIENCY_LABEL = REWARD_COMPONENT_LABELS.get(EFFICIENCY_COLUMN, "Travel Time Improvement")
def parse_args():

View File

@ -1,52 +1,80 @@
"""Central registry for training entry functions."""
from typing import Callable, Dict, List
from training.train_appo import train_sumo_appo
from training.train_dcmappo import train_sumo_dcmappo
from training.train_dcqmix import train_sumo_dcqmix
from training.train_d3pg import train_sumo_d3pg
from training.train_ddqn import train_sumo_ddqn
from training.train_ddpg import train_sumo_ddpg
from training.train_dqn import train_sumo_dqn
from training.train_gpro import train_sumo_gpro
from training.train_madqn import train_sumo_madqn
from training.train_mappo import train_sumo_mappo
from training.train_ppo import train_sumo_ppo
from training.train_qmix import train_sumo_qmix
from training.train_sac import train_sumo_sac
from training.train_sctd3 import train_sumo_sctd3
from training.train_tcamappo import train_sumo_tcamappo
from training.train_td3 import train_sumo_td3
from __future__ import annotations
import importlib
from typing import Callable, Dict, List, Tuple
# DEFAULT_MODELS: List[str] = ["ppo"]
DEFAULT_MODELS: List[str] = ["ppo", "gpro", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix", "ddpg", "d3pg", "sac", "td3"]
ALL_MODELS: List[str] = ["ppo", "gpro", "appo", "mappo", "tcamappo", "dcmappo", "dqn", "madqn", "ddqn", "qmix", "dcqmix", "ddpg", "d3pg", "sac", "td3", "sctd3"]
TRAINERS: Dict[str, Callable] = {
"ppo": train_sumo_ppo,
"gpro": train_sumo_gpro,
"appo": train_sumo_appo,
"mappo": train_sumo_mappo,
"tcamappo": train_sumo_tcamappo,
"dcmappo": train_sumo_dcmappo,
"dqn": train_sumo_dqn,
"madqn": train_sumo_madqn,
"ddqn": train_sumo_ddqn,
"qmix": train_sumo_qmix,
"dcqmix": train_sumo_dcqmix,
"ddpg": train_sumo_ddpg,
"d3pg": train_sumo_d3pg,
"sac": train_sumo_sac,
"td3": train_sumo_td3,
"sctd3": train_sumo_sctd3,
TRAINER_SPECS: Dict[str, Tuple[str, str]] = {
"no_control": ("training.train_no_control", "train_sumo_no_control"),
"ppo": ("training.train_ppo", "train_sumo_ppo"),
"gpro": ("training.train_gpro", "train_sumo_gpro"),
"appo": ("training.train_appo", "train_sumo_appo"),
"mappo": ("training.train_mappo", "train_sumo_mappo"),
"tcamappo": ("training.train_tcamappo", "train_sumo_tcamappo"),
"dcmappo": ("training.train_dcmappo", "train_sumo_dcmappo"),
"dqn": ("training.train_dqn", "train_sumo_dqn"),
"madqn": ("training.train_madqn", "train_sumo_madqn"),
"ddqn": ("training.train_ddqn", "train_sumo_ddqn"),
"qmix": ("training.train_qmix", "train_sumo_qmix"),
"dcqmix": ("training.train_dcqmix", "train_sumo_dcqmix"),
"ddpg": ("training.train_ddpg", "train_sumo_ddpg"),
"d3pg": ("training.train_d3pg", "train_sumo_d3pg"),
"sac": ("training.train_sac", "train_sumo_sac"),
"td3": ("training.train_td3", "train_sumo_td3"),
"sctd3": ("training.train_sctd3", "train_sumo_sctd3"),
}
DEFAULT_MODELS: List[str] = [
"no_control",
"ppo",
"gpro",
"mappo",
"tcamappo",
"dcmappo",
"dqn",
"madqn",
"ddqn",
"qmix",
"dcqmix",
"ddpg",
"d3pg",
"sac",
"td3",
]
ALL_MODELS: List[str] = list(TRAINER_SPECS.keys())
def get_trainer(model_name: str) -> Callable:
model = normalize_model_name(model_name)
module_name, function_name = TRAINER_SPECS[model]
module = importlib.import_module(module_name)
return getattr(module, function_name)
class _TrainerRegistry(dict):
def __contains__(self, key):
return str(key).strip().lower() in TRAINER_SPECS
def __getitem__(self, key):
return get_trainer(str(key))
def keys(self):
return TRAINER_SPECS.keys()
def items(self):
for key in TRAINER_SPECS:
yield key, get_trainer(key)
TRAINERS: Dict[str, Callable] = _TrainerRegistry()
def normalize_model_name(name: str) -> str:
model = name.strip().lower()
if model not in TRAINERS:
if model not in TRAINER_SPECS:
raise ValueError(f"Unsupported model name: {name}")
return model

View File

@ -1,6 +1,6 @@
"""
基于 SUMO+TraCI APPO 训练脚本
使用微观仿真环境训练 VSL 控制策略
鍩轰簬 SUMO+TraCI ?APPO 粌鑴氭湰
浣跨敤寰浠跨湡鐜 VSL 鎺у埗绛栫暐
"""
import os
import sys
@ -26,7 +26,7 @@ from utils.seeding import derive_seed, resolve_base_seed, set_global_seed
def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"""SUMO 环境下的 APPO 训练主函数"""
"""Train APPO on the SUMO VSL environment."""
with open("config_sumo_vsl.yaml", "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
@ -36,7 +36,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
set_global_seed(base_seed)
start_episode = 1
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"appo",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -47,6 +47,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -61,15 +62,14 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
action_dims = [env.action_dim] * env.num_controlled_edges
print("=" * 70)
print("APPO训练 - SUMO+TraCI VSL 环境")
print("APPO training - SUMO+TraCI VSL")
print("=" * 70)
print(f" 状态维度: {state_dim}")
print(f" 动作空间: {action_dims}")
print(f" Episode 步数: {env.episode_length}")
print(f" 控制间隔: {env.control_interval}s")
print(f" 隐藏维度: {agent_config.get('hidden_dim', 128)}")
print(f" 学习率: {agent_config.get('learning_rate', 3e-4)}")
print(f" 设备: {agent_config.get('device', 'cuda')}")
print(f" State dim: {state_dim}")
print(f" Action dims: {action_dims}")
print(f" Episode length: {env.episode_length}")
print(f" Control interval: {env.control_interval}s")
print(f" Learning rate: {agent_config.get('learning_rate', 3e-4)}")
print(f" Device: {agent_config.get('device', 'cuda')}")
print()
agent = APPOAgent(
@ -95,12 +95,12 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
total_episodes=train_config["num_episodes"]
)
# 训练参数
# 璁粌鍙傛暟
num_episodes = train_config["num_episodes"]
save_freq = train_config.get("save_freq", 50)
log_freq = train_config.get("log_freq", 10)
# 统计变量
# 缁熻鍙橀噺
episode_rewards = []
episode_throughputs = []
episode_mean_speeds = []
@ -111,11 +111,11 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
entropies = []
best_reward = -float("inf")
print("开始训练...\n")
print("Starting training...\n")
try:
for episode in range(start_episode, num_episodes + 1):
# 每个 episode 使用不同 seed 引入随机性
# 姣忎釜 episode 浣跨敤涓嶅悓 seed 寮曞叆闅忔満鎬?
seed = derive_seed(base_seed, episode)
state = env.reset(seed=seed)
episode_reward = 0
@ -158,7 +158,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
pbar.close()
# GAE 计算和策略更新
# GAE 璁$畻鍜岀瓥鐣ユ洿鏂?
if done:
next_value = 0.0
else:
@ -168,7 +168,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
train_stats = agent.update(next_value)
# 记录统计
# 璁板綍缁熻
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
@ -200,7 +200,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
ttc_risk=episode_ttc_risk,
)
# 保存最佳模型
# 淇濆瓨鏈€浣虫ā鍨?
episode_summary = {
"episode": episode,
"reward": float(episode_reward),
@ -229,7 +229,7 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
best_reward = episode_reward
agent.save(os.path.join(checkpoint_dir, "model_best.pt"))
# 定期日志
# 瀹氭湡鏃ュ織
if episode % log_freq == 0:
recent_rewards = episode_rewards[-log_freq:]
print(f"\nEpisode {episode}/{num_episodes}")
@ -249,20 +249,20 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Value Loss: {train_stats['value_loss']:.4f}")
print(f" Entropy: {train_stats['entropy']:.4f}")
# 定期保存
# 瀹氭湡淇濆瓨
if episode % save_freq == 0:
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}.pt"))
except KeyboardInterrupt:
print("\n训练被中断,保存当前模型...")
print("\nTraining interrupted, saving current model...")
agent.save(os.path.join(checkpoint_dir, "model_interrupted.pt"))
finally:
env.close()
# 最终保存
# 鏈€缁堜繚瀛?
agent.save(os.path.join(checkpoint_dir, f"model_ep{num_episodes}.pt"))
# 绘制训练曲线
# 缁樺埗璁粌鏇茬嚎
plot_training_curves(
episode_rewards, episode_throughputs, episode_mean_speeds, episode_speed_variance_norms, episode_ttc_risks,
policy_losses, value_losses,
@ -270,8 +270,8 @@ def train_sumo_appo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
)
print("=" * 70)
print("训练完成!")
print(f" 最佳奖励: {best_reward:.2f}")
print(f" 模型目录: {checkpoint_dir}")
print(f" 日志目录: {log_dir}")
print("Training complete")
print(f" Best reward: {best_reward:.2f}")
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""
"""
Directional Corridor MAPPO training script for SUMO + TraCI VSL.
"""
import os
@ -30,7 +30,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"dcmappo",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -41,6 +41,7 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -263,3 +264,6 @@ def train_sumo_dcmappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,6 +1,6 @@
"""
基于 SUMO+TraCI DDPG 训练脚本
使用 Stable-Baselines3 DDPG 算法
鍩轰簬 SUMO+TraCI ?DDPG 粌鑴氭湰
浣跨敤 Stable-Baselines3 ?DDPG 绠楁硶
"""
import os
import copy
@ -23,7 +23,7 @@ from utils.seeding import derive_seed, resolve_base_seed, set_global_seed
def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"""SUMO 环境下的 DDPG 训练主函数"""
"""Train DDPG on the SUMO VSL environment."""
with open("config_sumo_vsl.yaml", "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
@ -32,7 +32,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"ddpg",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -43,6 +43,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -57,14 +58,14 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
action_dims = [env.action_dim] * env.num_controlled_edges
print("=" * 70)
print("DDPG训练 - SUMO+TraCI VSL 环境")
print("DDPG training - SUMO+TraCI VSL")
print("=" * 70)
print(f" 状态维度: {state_dim}")
print(f" 动作空间: {action_dims}")
print(f" Episode 步数: {env.episode_length}")
print(f" 控制间隔: {env.control_interval}s")
print(f" 学习率: {agent_config.get('learning_rate', 3e-4)}")
print(f" 设备: {agent_config.get('device', 'cuda')}")
print(f" State dim: {state_dim}")
print(f" Action dims: {action_dims}")
print(f" Episode length: {env.episode_length}")
print(f" Control interval: {env.control_interval}s")
print(f" Learning rate: {agent_config.get('learning_rate', 3e-4)}")
print(f" Device: {agent_config.get('device', 'cuda')}")
print()
agent = DDPGAgent(
@ -94,7 +95,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
episode_ttc_risks = []
best_reward = -float("inf")
print("开始训练...\n")
print("Starting training...\n")
try:
for episode in range(1, num_episodes + 1):
@ -191,7 +192,7 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}"))
except KeyboardInterrupt:
print("\n训练被中断,保存当前模型...")
print("\nTraining interrupted, saving current model...")
agent.save(os.path.join(checkpoint_dir, "model_interrupted"))
finally:
env.close()
@ -204,8 +205,8 @@ def train_sumo_ddpg(log_dir=None, checkpoint_dir=None, run_timestamp=None):
)
print("=" * 70)
print("训练完成!")
print(f" 最佳奖励: {best_reward:.2f}")
print(f" 模型目录: {checkpoint_dir}")
print(f" 日志目录: {log_dir}")
print("Training complete")
print(f" Best reward: {best_reward:.2f}")
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""GRPO-inspired PPO training entrypoint for corridor VSL control."""
"""GRPO-inspired PPO training entrypoint for corridor VSL control."""
import os
import copy
import yaml
@ -29,7 +29,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"gpro",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -40,6 +40,7 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -282,3 +283,6 @@ def train_sumo_gpro(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Checkpoints: {checkpoint_dir}")
print(f" Logs: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""
"""
MAPPO training script for SUMO + TraCI VSL.
"""
import os
@ -31,7 +31,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"mappo",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -42,6 +42,7 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -260,3 +261,6 @@ def train_sumo_mappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -0,0 +1,209 @@
"""No-control baseline runner for synchronized reward baselines."""
from __future__ import annotations
import copy
import os
import matplotlib
import numpy as np
import yaml
from tqdm import tqdm
matplotlib.use("Agg")
from envs.edge_vsl_env import SUMOEdgeVSLEnvironment
from envs.reward_system import REWARD_COMPONENT_COLUMNS, average_reward_components, init_reward_component_totals
from utils.config import get_training_config
from utils.episode_artifacts import save_training_episode_artifacts
from utils.logger import TrainingLogger
from utils.plot import plot_training_curves
from utils.reward_baseline import EpisodeBaselineWriter, resolve_baseline_dir
from utils.run_dirs import resolve_run_dirs, write_shared_run_config
from utils.seeding import derive_seed, resolve_base_seed, set_global_seed
def _select_no_control_action(env: SUMOEdgeVSLEnvironment) -> np.ndarray:
if env.num_controlled_edges <= 0:
return np.zeros(0, dtype=np.int64)
return np.full(env.num_controlled_edges, env.action_dim - 1, dtype=np.int64)
def _baseline_row(episode: int, seed: int | None, reward: float, info: dict) -> dict:
return {
"episode": int(episode),
"step": int(info.get("step", 0)),
"seed": "" if seed is None else int(seed),
"sim_time": float(info.get("sim_time", np.nan)),
"reward": float(reward),
"mean_speed_kmh": float(info.get("mean_speed_kmh", np.nan)),
"num_vehicles": int(info.get("num_vehicles", 0)),
"mainline_completed_count": int(info.get("mainline_completed_count", 0)),
"mainline_interval_travel_time_mean_s": float(
info.get("mainline_interval_travel_time_mean_s", np.nan)
),
"mainline_travel_time_cumulative_mean_s": float(
info.get("mainline_travel_time_cumulative_mean_s", np.nan)
),
"ttc_risk_rate": float(info.get("ttc_risk_rate", np.nan)),
}
def train_sumo_no_control(log_dir=None, checkpoint_dir=None, run_timestamp=None):
with open("config_sumo_vsl.yaml", "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
train_config = get_training_config(config)
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"no_control",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
run_timestamp=run_timestamp,
)
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
reward_cfg = runtime_config.setdefault("environment", {}).setdefault("reward", {})
reward_cfg["mode"] = "absolute"
reward_cfg["baseline_dir"] = resolve_baseline_dir(runtime_config, resolved_run_timestamp)
baseline_dir = reward_cfg["baseline_dir"]
write_shared_run_config(
runtime_config,
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
run_timestamp=run_timestamp,
)
logger = TrainingLogger(log_dir, "no_control")
env = SUMOEdgeVSLEnvironment(runtime_config)
num_episodes = train_config["num_episodes"]
log_freq = train_config.get("log_freq", 10)
snapshot_interval = int(train_config.get("artifact_snapshot_interval", 50))
episode_rewards = []
episode_throughputs = []
episode_mean_speeds = []
episode_speed_variance_norms = []
episode_ttc_risks = []
print("=" * 70)
print("NO_CONTROL baseline runner - synchronized reward baseline")
print("=" * 70)
print(f" Episode steps: {env.episode_length}")
print(f" Baseline dir: {baseline_dir}")
print(f" Global seed: {base_seed if base_seed is not None else 'None (random)'}")
print()
try:
for episode in range(1, num_episodes + 1):
seed = derive_seed(base_seed, episode)
env.reset(seed=seed)
baseline_writer = EpisodeBaselineWriter(baseline_dir=baseline_dir, episode=episode)
episode_reward = 0.0
episode_throughput = 0.0
episode_speed = 0.0
episode_speed_variance_norm = 0.0
episode_ttc_risk = 0.0
episode_reward_components = init_reward_component_totals()
done = False
step = 0
pbar = tqdm(total=env.episode_length, desc=f"NO_CONTROL Ep {episode}/{num_episodes}", leave=False)
while not done:
action = _select_no_control_action(env)
_, reward, done, info = env.step(action, apply_control=True)
baseline_writer.append(_baseline_row(episode, seed, reward, info))
episode_reward += reward
episode_throughput += info["throughput"]
episode_speed += info["mean_speed_kmh"]
episode_speed_variance_norm += info["speed_variance_norm"]
episode_ttc_risk += float(info.get("ttc_risk_rate", 0.0))
for column in REWARD_COMPONENT_COLUMNS:
episode_reward_components[column] += float(info.get(column, 0.0))
step += 1
pbar.set_postfix(r=f"{episode_reward:.1f}", v=f"{info['mean_speed_kmh']:.1f}")
pbar.update(1)
pbar.close()
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
avg_reward_components = average_reward_components(episode_reward_components, step)
episode_rewards.append(episode_reward)
episode_throughputs.append(avg_tp)
episode_mean_speeds.append(avg_speed)
episode_speed_variance_norms.append(avg_speed_variance_norm)
episode_ttc_risks.append(episode_ttc_risk)
logger.log(
episode,
episode_reward,
avg_tp,
avg_speed,
speed_variance_norm=avg_speed_variance_norm,
reward_components=avg_reward_components,
ttc_risk=episode_ttc_risk,
)
episode_summary = {
"episode": int(episode),
"reward": float(episode_reward),
"avg_throughput": float(avg_tp),
"avg_mean_speed_kmh": float(avg_speed),
"avg_speed_variance_norm": float(avg_speed_variance_norm),
"ttc_risk": float(episode_ttc_risk),
"baseline_dir": baseline_dir,
}
for column, value in avg_reward_components.items():
episode_summary[f"avg_{column}"] = float(value)
save_training_episode_artifacts(
log_dir=log_dir,
episode=episode,
episode_metrics=env.episode_metrics,
control_edges=env.control_edges,
summary=episode_summary,
snapshot_interval=snapshot_interval,
)
if episode % log_freq == 0:
recent_rewards = episode_rewards[-log_freq:]
print(f"\nNO_CONTROL episode {episode}/{num_episodes}")
print(f" Reward: {episode_reward:.2f} (Avg: {np.mean(recent_rewards):.2f})")
print(f" Mean Speed: {avg_speed:.1f} km/h")
finally:
env.close()
plot_training_curves(
episode_rewards,
episode_throughputs,
episode_mean_speeds,
episode_speed_variance_norms,
episode_ttc_risks,
save_path=os.path.join(log_dir, "training_curves.png"),
)
return {
"model": "no_control",
"log_dir": log_dir,
"checkpoint_dir": checkpoint_dir,
"baseline_dir": baseline_dir,
}
if __name__ == "__main__":
train_sumo_no_control()

View File

@ -1,6 +1,6 @@
"""
基于 SUMO+TraCI PPO 训练脚本
使用微观仿真环境训练 VSL 控制策略
鍩轰簬 SUMO+TraCI ?PPO 粌鑴氭湰
浣跨敤寰浠跨湡鐜 VSL 鎺у埗绛栫暐
"""
import os
import sys
@ -26,8 +26,8 @@ from utils.seeding import derive_seed, resolve_base_seed, set_global_seed
def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
"""SUMO 环境下的 PPO 训练主函数"""
# 加载配置
"""Train PPO on the SUMO VSL environment."""
# 鍔犺浇閰嶇疆
with open("config_sumo_vsl.yaml", "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
@ -37,7 +37,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
set_global_seed(base_seed)
start_episode = 1
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"ppo",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -48,6 +48,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -57,26 +58,24 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
logger = TrainingLogger(log_dir, "ppo")
# 创建环境
# 鍒涘缓鐜
env = SUMOEdgeVSLEnvironment(runtime_config)
state_dim = env.state_dim
action_dims = [env.action_dim] * env.num_controlled_edges
print("=" * 70)
print("PPO 训练 - SUMO+TraCI VSL 环境")
print("PPO training - SUMO+TraCI VSL")
print("=" * 70)
print(f" 状态维度: {state_dim}")
print(f" 控制边数: {env.num_edges}")
print(f" 每边动作数: {env.action_dim}")
print(f" Episode 步数: {env.episode_length}")
print(f" 控制间隔: {env.control_interval}s")
print(f" 隐藏层: {agent_config.get('hidden_layers', [512, 256])}")
print(f" 学习率: {agent_config.get('learning_rate', 3e-4)}")
print(f" 设备: {agent_config.get('device', 'cuda')}")
print(f" State dim: {state_dim}")
print(f" Action dims: {action_dims}")
print(f" Episode length: {env.episode_length}")
print(f" Control interval: {env.control_interval}s")
print(f" Learning rate: {agent_config.get('learning_rate', 3e-4)}")
print(f" Device: {agent_config.get('device', 'cuda')}")
print()
# 创建 PPO 智能体
# 鍒涘缓 PPO 鏅鸿兘浣?
agent = PPOAgent(
state_dim=state_dim,
action_dims=action_dims,
@ -95,12 +94,12 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
total_episodes=train_config["num_episodes"],
)
# 训练参数
# 璁粌鍙傛暟
num_episodes = train_config["num_episodes"]
save_freq = train_config.get("save_freq", 50)
log_freq = train_config.get("log_freq", 10)
# 统计变量
# 缁熻鍙橀噺
episode_rewards = []
episode_throughputs = []
episode_mean_speeds = []
@ -111,11 +110,11 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
entropies = []
best_reward = -float("inf")
print("开始训练...\n")
print("Starting training...\n")
try:
for episode in range(start_episode, num_episodes + 1):
# 每个 episode 使用不同 seed 引入随机性
# 姣忎釜 episode 浣跨敤涓嶅悓 seed 寮曞叆闅忔満鎬?
seed = derive_seed(base_seed, episode)
state = env.reset(seed=seed)
episode_reward = 0
@ -158,7 +157,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
pbar.close()
# GAE 计算和策略更新
# GAE 璁$畻鍜岀瓥鐣ユ洿鏂?
if done:
next_value = 0.0
else:
@ -168,7 +167,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
train_stats = agent.update(next_value)
# 记录统计
# 璁板綍缁熻
avg_tp = episode_throughput / max(step, 1)
avg_speed = episode_speed / max(step, 1)
avg_speed_variance_norm = episode_speed_variance_norm / max(step, 1)
@ -200,7 +199,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
ttc_risk=episode_ttc_risk,
)
# 保存最佳模型
# 淇濆瓨鏈€浣虫ā鍨?
episode_summary = {
"episode": episode,
"reward": float(episode_reward),
@ -229,7 +228,7 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
best_reward = episode_reward
agent.save(os.path.join(checkpoint_dir, "model_best.pt"))
# 定期日志
# 瀹氭湡鏃ュ織
if episode % log_freq == 0:
recent_rewards = episode_rewards[-log_freq:]
print(f"\nEpisode {episode}/{num_episodes}")
@ -249,20 +248,20 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Value Loss: {train_stats['value_loss']:.4f}")
print(f" Entropy: {train_stats['entropy']:.4f}")
# 定期保存
# 瀹氭湡淇濆瓨
if episode % save_freq == 0:
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}.pt"))
except KeyboardInterrupt:
print("\n训练被中断,保存当前模型...")
print("\nTraining interrupted, saving current model...")
agent.save(os.path.join(checkpoint_dir, "model_interrupted.pt"))
finally:
env.close()
# 最终保存
# 鏈€缁堜繚瀛?
agent.save(os.path.join(checkpoint_dir, f"model_ep{num_episodes}.pt"))
# 绘制训练曲线
# 缁樺埗璁粌鏇茬嚎
plot_training_curves(
episode_rewards, episode_throughputs, episode_mean_speeds, episode_speed_variance_norms, episode_ttc_risks,
policy_losses, value_losses,
@ -270,8 +269,8 @@ def train_sumo_ppo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
)
print("=" * 70)
print("训练完成!")
print(f" 最佳奖励: {best_reward:.2f}")
print(f" 模型目录: {checkpoint_dir}")
print(f" 日志目录: {log_dir}")
print("Training complete")
print(f" Best reward: {best_reward:.2f}")
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""SUMO SAC training entrypoint."""
"""SUMO SAC training entrypoint."""
import os
import copy
import yaml
@ -29,7 +29,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"sac",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -40,6 +40,7 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -224,3 +225,6 @@ def train_sumo_sac(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Checkpoints: {checkpoint_dir}")
print(f" Logs: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""
"""
Temporal Credit Assignment MAPPO training script for SUMO + TraCI VSL.
"""
import copy
@ -31,7 +31,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
"tcamappo",
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -42,6 +42,7 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -270,3 +271,6 @@ def train_sumo_tcamappo(log_dir=None, checkpoint_dir=None, run_timestamp=None):
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,6 +1,6 @@
"""
基于 SUMO+TraCI TD3 训练脚本
使用 Stable-Baselines3 TD3 算法
鍩轰簬 SUMO+TraCI ?TD3 粌鑴氭湰
浣跨敤 Stable-Baselines3 ?TD3 绠楁硶
"""
import os
import copy
@ -32,7 +32,7 @@ def train_sumo_td3(
display_name: str = "TD3",
agent_class=TD3Agent,
):
"""SUMO 环境下的 TD3 训练主函数"""
"""Train TD3 on the SUMO VSL environment."""
with open("config_sumo_vsl.yaml", "r", encoding="utf-8") as f:
config = yaml.safe_load(f)
@ -41,7 +41,7 @@ def train_sumo_td3(
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
model_name,
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -52,6 +52,7 @@ def train_sumo_td3(
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -66,14 +67,14 @@ def train_sumo_td3(
action_dims = [env.action_dim] * env.num_controlled_edges
print("=" * 70)
print(f"{display_name}训练 - SUMO+TraCI VSL 环境")
print(f"{display_name} training - SUMO+TraCI VSL")
print("=" * 70)
print(f" 状态维度: {state_dim}")
print(f" 动作空间: {action_dims}")
print(f" Episode 步数: {env.episode_length}")
print(f" 控制间隔: {env.control_interval}s")
print(f" 学习率: {agent_config.get('learning_rate', 3e-4)}")
print(f" 设备: {agent_config.get('device', 'cuda')}")
print(f" State dim: {state_dim}")
print(f" Action dims: {action_dims}")
print(f" Episode length: {env.episode_length}")
print(f" Control interval: {env.control_interval}s")
print(f" Learning rate: {agent_config.get('learning_rate', 3e-4)}")
print(f" Device: {agent_config.get('device', 'cuda')}")
print()
common_kwargs = dict(
@ -119,7 +120,7 @@ def train_sumo_td3(
episode_ttc_risks = []
best_reward = -float("inf")
print("开始训练...\n")
print("Starting training...\n")
try:
for episode in range(1, num_episodes + 1):
@ -224,7 +225,7 @@ def train_sumo_td3(
agent.save(os.path.join(checkpoint_dir, f"model_ep{episode}"))
except KeyboardInterrupt:
print("\n训练被中断,保存当前模型...")
print("\nTraining interrupted, saving current model...")
agent.save(os.path.join(checkpoint_dir, "model_interrupted"))
finally:
env.close()
@ -237,8 +238,8 @@ def train_sumo_td3(
)
print("=" * 70)
print("训练完成!")
print(f" 最佳奖励: {best_reward:.2f}")
print(f" 模型目录: {checkpoint_dir}")
print(f" 日志目录: {log_dir}")
print("Training complete")
print(f" Best reward: {best_reward:.2f}")
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

View File

@ -1,4 +1,4 @@
"""Shared training loop for value-based VSL agents."""
"""Shared training loop for value-based VSL agents."""
from __future__ import annotations
import copy
@ -76,7 +76,7 @@ def train_sumo_value_based(
base_seed = resolve_base_seed(train_config)
set_global_seed(base_seed)
_, checkpoint_dir, log_dir = resolve_run_dirs(
resolved_run_timestamp, checkpoint_dir, log_dir = resolve_run_dirs(
model_key,
log_dir=log_dir,
checkpoint_dir=checkpoint_dir,
@ -88,6 +88,7 @@ def train_sumo_value_based(
runtime_config = copy.deepcopy(config)
runtime_config.setdefault("runtime", {})["output_dir"] = log_dir
runtime_config["runtime"]["evaluation_mode"] = False
runtime_config["runtime"]["run_timestamp"] = resolved_run_timestamp
write_shared_run_config(
runtime_config,
log_dir=log_dir,
@ -266,3 +267,6 @@ def train_sumo_value_based(
print(f" Model dir: {checkpoint_dir}")
print(f" Log dir: {log_dir}")
print("=" * 70)

138
utils/reward_baseline.py Normal file
View File

@ -0,0 +1,138 @@
"""Step-level no-control reward baseline exchange utilities."""
from __future__ import annotations
import csv
import os
import tempfile
import time
from pathlib import Path
from typing import Dict, Iterable, Mapping
BASELINE_COLUMNS = [
"episode",
"step",
"seed",
"sim_time",
"reward",
"mean_speed_kmh",
"num_vehicles",
"mainline_completed_count",
"mainline_interval_travel_time_mean_s",
"mainline_travel_time_cumulative_mean_s",
"ttc_risk_rate",
]
def resolve_baseline_dir(config: Mapping[str, object], run_timestamp: str | None = None) -> str:
runtime_cfg = config.get("runtime", {}) if isinstance(config, Mapping) else {}
reward_cfg = (config.get("environment", {}) or {}).get("reward", {}) if isinstance(config, Mapping) else {}
baseline_dir = str(
runtime_cfg.get("reward_baseline_dir")
or reward_cfg.get("baseline_dir")
or ""
).strip()
if baseline_dir:
return os.path.abspath(baseline_dir)
timestamp = run_timestamp or str(runtime_cfg.get("run_timestamp", "") or "default")
return os.path.abspath(os.path.join("runs", timestamp, "reward_baseline"))
def episode_baseline_path(baseline_dir: str, episode: int) -> str:
return os.path.join(os.path.abspath(baseline_dir), f"episode_{int(episode):04d}.csv")
def write_episode_baseline(
*,
baseline_dir: str,
episode: int,
rows: Iterable[Mapping[str, object]],
) -> str:
os.makedirs(baseline_dir, exist_ok=True)
target_path = episode_baseline_path(baseline_dir, episode)
fd, temp_path = tempfile.mkstemp(
prefix=f"episode_{int(episode):04d}_",
suffix=".tmp",
dir=baseline_dir,
text=True,
)
try:
with os.fdopen(fd, "w", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=BASELINE_COLUMNS)
writer.writeheader()
for row in rows:
writer.writerow({column: row.get(column, "") for column in BASELINE_COLUMNS})
os.replace(temp_path, target_path)
finally:
if os.path.exists(temp_path):
os.remove(temp_path)
return target_path
class EpisodeBaselineWriter:
def __init__(self, *, baseline_dir: str, episode: int):
self.baseline_dir = os.path.abspath(baseline_dir)
self.episode = int(episode)
self.rows: list[Mapping[str, object]] = []
def append(self, row: Mapping[str, object]) -> str:
self.rows.append(dict(row))
return write_episode_baseline(
baseline_dir=self.baseline_dir,
episode=self.episode,
rows=self.rows,
)
def read_episode_baseline(path: str) -> Dict[int, Dict[str, float]]:
baseline_by_step: Dict[int, Dict[str, float]] = {}
with open(path, "r", newline="", encoding="utf-8-sig") as f:
reader = csv.DictReader(f)
for row in reader:
try:
step = int(float(row.get("step", "")))
except (TypeError, ValueError):
continue
baseline_by_step[step] = {
"mean_speed_kmh": _safe_float(row.get("mean_speed_kmh")),
"mainline_completed_count": _safe_float(row.get("mainline_completed_count")),
"mainline_interval_travel_time_mean_s": _safe_float(
row.get("mainline_interval_travel_time_mean_s")
),
"mainline_travel_time_cumulative_mean_s": _safe_float(
row.get("mainline_travel_time_cumulative_mean_s")
),
"ttc_risk_rate": _safe_float(row.get("ttc_risk_rate")),
}
return baseline_by_step
def wait_for_episode_baseline(
*,
baseline_dir: str,
episode: int,
min_step: int,
timeout_s: float,
poll_interval_s: float,
) -> Dict[int, Dict[str, float]]:
path = episode_baseline_path(baseline_dir, episode)
deadline = time.monotonic() + max(float(timeout_s), 0.0)
while True:
if os.path.isfile(path):
baseline = read_episode_baseline(path)
if baseline and max(baseline) >= int(min_step):
return baseline
if time.monotonic() >= deadline:
raise TimeoutError(
f"Timed out waiting for reward baseline episode={episode} step>={min_step}: {path}"
)
time.sleep(max(float(poll_interval_s), 0.05))
def _safe_float(value: object) -> float:
try:
return float(value)
except (TypeError, ValueError):
return float("nan")