diff --git a/agents/tcamappo_agent.py b/agents/tcamappo_agent.py index 1e55e44..3092c7a 100644 --- a/agents/tcamappo_agent.py +++ b/agents/tcamappo_agent.py @@ -262,7 +262,7 @@ class TCAMAPPOAgent: action_norm = np.asarray(action, dtype=np.float32) / max(self.num_actions - 1, 1) reward_features = np.array( [ - float(reward) / 10.0, + float(reward), *[float(info.get(column, 0.0)) for column in REWARD_COMPONENT_COLUMNS], ], dtype=np.float32, diff --git a/config_sumo_vsl.yaml b/config_sumo_vsl.yaml index 7f3a846..c6242f4 100644 --- a/config_sumo_vsl.yaml +++ b/config_sumo_vsl.yaml @@ -53,9 +53,8 @@ environment: free_flow_speed: 30.56 reward: - reward_scale: 10.0 - efficiency_alpha: 3.0 - safety_beta: 4.0 + efficiency_alpha: 2.19 + safety_beta: 9.19 efficiency_exponent: 0.50 safety_exponent: 0.50 ttc_threshold_s: 2.3 diff --git a/envs/edge_vsl_env.py b/envs/edge_vsl_env.py index a4ddb86..669f003 100644 --- a/envs/edge_vsl_env.py +++ b/envs/edge_vsl_env.py @@ -554,7 +554,7 @@ class SUMOEdgeVSLEnvironment: state_parts.append(time_progress) state_parts.append(np.sin(2 * np.pi * time_progress)) state_parts.append(np.cos(2 * np.pi * time_progress)) - state_parts.append(self._last_reward / 10.0) + state_parts.append(self._last_reward) return np.array(state_parts, dtype=np.float32) diff --git a/envs/reward_design_blueprint.py b/envs/reward_design_blueprint.py index ce0ec58..f6dc5b9 100644 --- a/envs/reward_design_blueprint.py +++ b/envs/reward_design_blueprint.py @@ -135,7 +135,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint: ), terms=terms, global_formula_tex=( - r"R(t)=\kappa \,R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}" + r"R(t)=R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}" r"R_{\mathrm{safety}}(t)^{\lambda_{\mathrm{safe}}}" ), excluded_metrics=( diff --git a/envs/reward_system.py b/envs/reward_system.py index a84127f..5ccd0f4 100644 --- a/envs/reward_system.py +++ b/envs/reward_system.py @@ -36,9 +36,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s @dataclass(frozen=True) class RewardConfig: - reward_scale: float = 10.0 - efficiency_alpha: float = 3.0 - safety_beta: float = 4.0 + efficiency_alpha: float = 2.19 + safety_beta: float = 9.19 efficiency_exponent: float = 0.50 safety_exponent: float = 0.50 ttc_threshold_s: float = 2.3 @@ -56,9 +55,8 @@ class RewardConfig: _ = speed_actions_ms return cls( - reward_scale=float(raw_cfg.get("reward_scale", 10.0)), - efficiency_alpha=float(raw_cfg.get("efficiency_alpha", 3.0)), - safety_beta=float(raw_cfg.get("safety_beta", 4.0)), + efficiency_alpha=float(raw_cfg.get("efficiency_alpha", 2.19)), + safety_beta=float(raw_cfg.get("safety_beta", 9.19)), efficiency_exponent=float(raw_cfg.get("efficiency_exponent", 0.50)), safety_exponent=float(raw_cfg.get("safety_exponent", 0.50)), ttc_threshold_s=float(raw_cfg.get("ttc_threshold_s", 2.3)), @@ -122,4 +120,4 @@ class RewardCalculator: info["ttc_threshold_s"] = float(self.config.ttc_threshold_s) info["efficiency_lambda"] = float(lambda_eff) info["safety_lambda"] = float(lambda_safe) - return float(r_utility * self.config.reward_scale) + return float(r_utility)