标定奖励函数参数
This commit is contained in:
parent
43c9ae3fbe
commit
d764c7e763
|
|
@@ -262,7 +262,7 @@ class TCAMAPPOAgent:
|
||||||
action_norm = np.asarray(action, dtype=np.float32) / max(self.num_actions - 1, 1)
|
action_norm = np.asarray(action, dtype=np.float32) / max(self.num_actions - 1, 1)
|
||||||
reward_features = np.array(
|
reward_features = np.array(
|
||||||
[
|
[
|
||||||
float(reward) / 10.0,
|
float(reward),
|
||||||
*[float(info.get(column, 0.0)) for column in REWARD_COMPONENT_COLUMNS],
|
*[float(info.get(column, 0.0)) for column in REWARD_COMPONENT_COLUMNS],
|
||||||
],
|
],
|
||||||
dtype=np.float32,
|
dtype=np.float32,
|
||||||
|
|
|
||||||
|
|
@@ -53,9 +53,8 @@ environment:
|
||||||
free_flow_speed: 30.56
|
free_flow_speed: 30.56
|
||||||
|
|
||||||
reward:
|
reward:
|
||||||
reward_scale: 10.0
|
efficiency_alpha: 2.19
|
||||||
efficiency_alpha: 3.0
|
safety_beta: 9.19
|
||||||
safety_beta: 4.0
|
|
||||||
efficiency_exponent: 0.50
|
efficiency_exponent: 0.50
|
||||||
safety_exponent: 0.50
|
safety_exponent: 0.50
|
||||||
ttc_threshold_s: 2.3
|
ttc_threshold_s: 2.3
|
||||||
|
|
|
||||||
|
|
@@ -554,7 +554,7 @@ class SUMOEdgeVSLEnvironment:
|
||||||
state_parts.append(time_progress)
|
state_parts.append(time_progress)
|
||||||
state_parts.append(np.sin(2 * np.pi * time_progress))
|
state_parts.append(np.sin(2 * np.pi * time_progress))
|
||||||
state_parts.append(np.cos(2 * np.pi * time_progress))
|
state_parts.append(np.cos(2 * np.pi * time_progress))
|
||||||
state_parts.append(self._last_reward / 10.0)
|
state_parts.append(self._last_reward)
|
||||||
|
|
||||||
return np.array(state_parts, dtype=np.float32)
|
return np.array(state_parts, dtype=np.float32)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -135,7 +135,7 @@ def build_tca_mappo_reward_blueprint() -> RewardBlueprint:
|
||||||
),
|
),
|
||||||
terms=terms,
|
terms=terms,
|
||||||
global_formula_tex=(
|
global_formula_tex=(
|
||||||
r"R(t)=\kappa \,R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}"
|
r"R(t)=R_{\mathrm{efficiency}}(t)^{\lambda_{\mathrm{eff}}}"
|
||||||
r"R_{\mathrm{safety}}(t)^{\lambda_{\mathrm{safe}}}"
|
r"R_{\mathrm{safety}}(t)^{\lambda_{\mathrm{safe}}}"
|
||||||
),
|
),
|
||||||
excluded_metrics=(
|
excluded_metrics=(
|
||||||
|
|
|
||||||
|
|
@@ -36,9 +36,8 @@ def average_reward_components(totals: Mapping[str, float], steps: int) -> Dict[s
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class RewardConfig:
|
class RewardConfig:
|
||||||
reward_scale: float = 10.0
|
efficiency_alpha: float = 2.19
|
||||||
efficiency_alpha: float = 3.0
|
safety_beta: float = 9.19
|
||||||
safety_beta: float = 4.0
|
|
||||||
efficiency_exponent: float = 0.50
|
efficiency_exponent: float = 0.50
|
||||||
safety_exponent: float = 0.50
|
safety_exponent: float = 0.50
|
||||||
ttc_threshold_s: float = 2.3
|
ttc_threshold_s: float = 2.3
|
||||||
|
|
@@ -56,9 +55,8 @@ class RewardConfig:
|
||||||
_ = speed_actions_ms
|
_ = speed_actions_ms
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
reward_scale=float(raw_cfg.get("reward_scale", 10.0)),
|
efficiency_alpha=float(raw_cfg.get("efficiency_alpha", 2.19)),
|
||||||
efficiency_alpha=float(raw_cfg.get("efficiency_alpha", 3.0)),
|
safety_beta=float(raw_cfg.get("safety_beta", 9.19)),
|
||||||
safety_beta=float(raw_cfg.get("safety_beta", 4.0)),
|
|
||||||
efficiency_exponent=float(raw_cfg.get("efficiency_exponent", 0.50)),
|
efficiency_exponent=float(raw_cfg.get("efficiency_exponent", 0.50)),
|
||||||
safety_exponent=float(raw_cfg.get("safety_exponent", 0.50)),
|
safety_exponent=float(raw_cfg.get("safety_exponent", 0.50)),
|
||||||
ttc_threshold_s=float(raw_cfg.get("ttc_threshold_s", 2.3)),
|
ttc_threshold_s=float(raw_cfg.get("ttc_threshold_s", 2.3)),
|
||||||
|
|
@@ -122,4 +120,4 @@ class RewardCalculator:
|
||||||
info["ttc_threshold_s"] = float(self.config.ttc_threshold_s)
|
info["ttc_threshold_s"] = float(self.config.ttc_threshold_s)
|
||||||
info["efficiency_lambda"] = float(lambda_eff)
|
info["efficiency_lambda"] = float(lambda_eff)
|
||||||
info["safety_lambda"] = float(lambda_safe)
|
info["safety_lambda"] = float(lambda_safe)
|
||||||
return float(r_utility * self.config.reward_scale)
|
return float(r_utility)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue