% ctm-dqn/docs/reward_function_core.tex

\documentclass[12pt]{article}
\usepackage[a4paper,margin=1in]{geometry}
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{CJKutf8}
\usepackage[unicode]{hyperref}

\hypersetup{colorlinks=true,linkcolor=black,urlcolor=blue,citecolor=black}

\title{TCA-MAPPO 实验奖励函数说明}
\author{}
\date{\today}

\begin{document}
\begin{CJK}{UTF8}{gbsn}
\maketitle

\section{设计思想}

当前奖励函数采用同轮次 no-control 反事实基准下的``平均通行时间--TTC 风险改善''结构。它不再直接奖励某个绝对交通状态，而是比较模型控制与 no-control 在同一 episode、同一随机种子、同一控制步下的差异。

这样做的核心原因是：交通需求、车辆生成随机性和事故场景随机性会造成不同 episode 的绝对运行状态天然波动。使用同步 no-control 作为反事实基准后，奖励可以更直接地表示可变限速控制策略相对于不控制策略的边际贡献。

\section{通行能力指标：主线平均通行时间}

设 episode $e$ 中第 $t$ 个控制步结束时，模型 $\pi$ 下已经完成主线通行的车辆集合为 $\mathcal{A}_{\pi}^{(e)}(t)$。车辆 $i$ 的主线进入时刻为 $t_i^{\mathrm{dep}}$，离开时刻为 $t_i^{\mathrm{arr}}$，则其主线通行时间为
\begin{equation}
T_i=t_i^{\mathrm{arr}}-t_i^{\mathrm{dep}}.
\end{equation}

当前实现采用累计主线平均通行时间：
\begin{equation}
\bar{T}_{\pi}^{(e)}(t)=
\begin{cases}
\dfrac{1}{|\mathcal{A}_{\pi}^{(e)}(t)|}
\sum_{i\in\mathcal{A}_{\pi}^{(e)}(t)}T_i, & |\mathcal{A}_{\pi}^{(e)}(t)|>0,\\[10pt]
\mathrm{NaN}, & |\mathcal{A}_{\pi}^{(e)}(t)|=0.
\end{cases}
\end{equation}

代码中该指标对应 \texttt{mainline\_travel\_time\_cumulative\_mean\_s}。使用累计均值而不是单步区间均值，是因为较短控制步内可能没有主线车辆完成通行，累计均值可以降低稀疏完成事件导致的奖励缺失。

设 no-control 在同一 episode、同一 seed、同一控制步下的累计主线平均通行时间为 $\bar{T}_{0}^{(e)}(t)$。通行时间改善量定义为
\begin{equation}
\Delta_T^{(e)}(t)=
\mathrm{clip}\!\left(
\frac{\bar{T}_{0}^{(e)}(t)-\bar{T}_{\pi}^{(e)}(t)}
{\max\!\left(\bar{T}_{0}^{(e)}(t),T_{\min}\right)},-1,1
\right).
\end{equation}

其中 $\mathrm{clip}(x,a,b)=\min\!\left(\max(x,a),b\right)$ 表示将 $x$ 截断到区间 $[a,b]$。当前配置中，最小分母为
\begin{equation}
T_{\min}=60\ \mathrm{s}.
\end{equation}

该形式表示相对于 no-control 的通行时间比例改善，而不是固定秒数尺度下的绝对改善。由于该比例是无量纲量，在高流量阶段和低流量阶段，通行时间项的数值尺度更加一致；同时 $T_{\min}$ 可以避免早期基准通行时间过小导致比例项异常放大。

当模型降低平均通行时间时，$\Delta_T^{(e)}(t)>0$；当模型增加平均通行时间时，$\Delta_T^{(e)}(t)<0$。若当前模型或 no-control 尚无完成车辆导致累计通行时间不可用，则当前实现令 $\Delta_T^{(e)}(t)=0$。

\section{安全指标：TTC 风险}

安全项采用基于 TTC（Time To Collision）的追尾风险代理。对受控走廊内车辆 $i$，设其速度为 $v_i(t)$，前车速度为 $v_i^{\mathrm{lead}}(t)$，与前车净间距为 $g_i(t)$。正向闭合速度定义为
\begin{equation}
\Delta v_i^+(t)=\max\!\left(v_i(t)-v_i^{\mathrm{lead}}(t),0\right).
\end{equation}

若 $\Delta v_i^+(t)>0$，则
\begin{equation}
\mathrm{TTC}_i(t)=\frac{g_i(t)}{\Delta v_i^+(t)}.
\end{equation}
若车辆没有前车、前车不在受控走廊内，或 $\Delta v_i^+(t)=0$，则 TTC 无定义；此时约定 $\mathrm{TTC}_i(t)=0$ 作为``无有效 TTC''的标记，该车辆在当前步的 TTC 风险贡献记为 $0$。

设 TTC 阈值为 $\tau_{\mathrm{ttc}}$，当前配置为
\begin{equation}
\tau_{\mathrm{ttc}}=2.3\ \mathrm{s}.
\end{equation}

单车风险贡献为
\begin{equation}
\rho_i(t)=
\begin{cases}
\max\!\left(0,1-\dfrac{\mathrm{TTC}_i(t)}{\tau_{\mathrm{ttc}}}\right), & \mathrm{TTC}_i(t)>0,\\[10pt]
0, & \mathrm{TTC}_i(t)=0.
\end{cases}
\end{equation}

设控制步 $t$ 时模型 $\pi$ 下受控走廊内的车辆数为 $N_{\pi}^{(e)}(t)$，则该控制步的总体 TTC 风险为
\begin{equation}
S_{\pi}^{(e)}(t)=
\begin{cases}
\dfrac{1}{N_{\pi}^{(e)}(t)}\sum_{i=1}^{N_{\pi}^{(e)}(t)}\rho_i(t), & N_{\pi}^{(e)}(t)>0,\\[10pt]
0, & N_{\pi}^{(e)}(t)=0.
\end{cases}
\end{equation}

设同一 episode、同一 seed、同一控制步下 no-control 的 TTC 风险为 $S_0^{(e)}(t)$。安全改善量定义为
\begin{equation}
\Delta_S^{(e)}(t)=
\mathrm{clip}\!\left(S_0^{(e)}(t)-S_{\pi}^{(e)}(t),-1,1\right).
\end{equation}

当模型降低 TTC 风险时，$\Delta_S^{(e)}(t)>0$；当模型提高 TTC 风险时，$\Delta_S^{(e)}(t)<0$。

\section{最终奖励函数}

当前训练默认采用 \texttt{paired\_no\_control} 模式。设基础随机种子为 $z_0$，episode $e$ 使用的 SUMO 随机种子为
\begin{equation}
z_e=z_0+e.
\end{equation}
no-control 基准进程与其它模型使用相同的 $z_e$ 运行同一个 episode。

最终奖励定义为通行时间改善与 TTC 风险改善的线性组合：
\begin{equation}
R_{\pi}^{(e)}(t)=
w_T\Delta_T^{(e)}(t)+w_S\Delta_S^{(e)}(t).
\end{equation}

当前配置为
\begin{equation}
w_T=0.5,\qquad w_S=0.5.
\end{equation}

若模型在同一 seed、同一控制步下与 no-control 的累计主线平均通行时间和 TTC 风险完全一致，则 $R_{\pi}^{(e)}(t)=0$。若模型降低通行时间或降低 TTC 风险，则对应项为正；若模型增加通行时间或提高 TTC 风险，则对应项为负。

\section{同步 no-control 基准机制}

no-control runner 在每个 episode 内按 step 写出基准过程数据：
\begin{center}
\texttt{runs/<run\_timestamp>/reward\_baseline/episode\_XXXX.csv}
\end{center}

其它模型在 episode $e$ 的第 $t$ 个控制步计算奖励前，会等待对应 CSV 文件中第 $t$ 行出现，然后读取 no-control 的 $\bar{T}_{0}^{(e)}(t)$ 与 $S_0^{(e)}(t)$。若基准文件不存在或当前 step 尚未写出，其它模型会阻塞等待。最长等待时间由 \texttt{baseline\_wait\_timeout\_s} 控制；轮询间隔由 \texttt{baseline\_poll\_interval\_s} 控制。

\section{代码变量对应关系}

\begin{table}[htbp]
\centering
\caption{奖励函数数学符号与代码变量对应关系}
\footnotesize
\begin{tabular}{@{}llp{0.28\linewidth}@{}}
\toprule
数学符号 & 代码变量 & 含义 \\
\midrule
$R_{\pi}^{(e)}(t)$ & \texttt{reward}, \texttt{r\_improvement} & 最终训练奖励 \\
$\Delta_T^{(e)}(t)$ & \texttt{r\_travel\_time\_improvement} & 主线平均通行时间改善量 \\
$\Delta_S^{(e)}(t)$ & \texttt{r\_ttc\_improvement} & TTC 风险改善量 \\
$\bar{T}_{\pi}^{(e)}(t)$ & \texttt{mainline\_travel\_time\_cumulative\_mean\_s} & 当前模型累计主线平均通行时间 \\
$\bar{T}_{0}^{(e)}(t)$ & \texttt{baseline\_mainline\_travel\_time\_cumulative\_mean\_s} & no-control 累计主线平均通行时间 \\
$S_{\pi}^{(e)}(t)$ & \texttt{ttc\_risk\_rate} & 当前模型 TTC 风险 \\
$S_0^{(e)}(t)$ & \texttt{baseline\_ttc\_risk\_rate} & no-control TTC 风险 \\
$T_{\min}$ & \texttt{travel\_time\_min\_denominator\_s} & 通行时间相对改善最小分母 \\
$w_T$ & \texttt{travel\_time\_weight} & 通行时间改善权重 \\
$w_S$ & \texttt{ttc\_weight} & TTC 风险改善权重 \\
$\tau_{\mathrm{ttc}}$ & \texttt{ttc\_threshold\_s} & TTC 风险阈值 \\
$\rho_i(t)$ & \texttt{ttc\_risk\_samples[i]} & 单车 TTC 风险贡献 \\
$\mathrm{TTC}_i(t)$ & \texttt{ttc\_samples[i]} & 单车 TTC \\
\bottomrule
\end{tabular}
\end{table}

\section{运行要求}

使用 \texttt{paired\_no\_control} 奖励时，no-control runner 与其它模型必须使用相同的 \texttt{run\_timestamp}。推荐先启动 no-control，再启动其它模型；若并行启动，其它模型会自动在 step 级别等待 no-control 写出对应基准行。

\section*{参考文献}
\begin{enumerate}
    \item Wang, Q., et al. Assessing the Transferability of Time-to-Collision and Other Car-Following Safety Measures in Real-Time Rear-End Crash Risk Prediction. \textit{Accident Analysis \& Prevention}, 2024, 197: 107640.
    \item Jin, Q., Abdel-Aty, M., Ugan, J., Islam, Z., Zheng, O. Identifying the Threshold Discrepancy of Rear-End Conflicts under Clear and Rainy Weather Conditions Using Trajectory Data. \textit{Transportation Research Record}, 2025, 2679(4): 952--968.
\end{enumerate}

\end{CJK}
\end{document}