diff --git a/appo_sumo_agent.py b/appo_sumo_agent.py deleted file mode 100644 index a91cfaf..0000000 --- a/appo_sumo_agent.py +++ /dev/null @@ -1,321 +0,0 @@ -""" -APPO for SUMO - MultiDiscrete动作空间版本 -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import numpy as np -from typing import Tuple, List, Dict - - -class SpatialAttentionMultiDiscrete(nn.Module): - """空间注意力 + MultiDiscrete输出""" - - def __init__(self, d_model: int, num_heads: int = 4): - super().__init__() - assert d_model % num_heads == 0 - self.d_model = d_model - self.num_heads = num_heads - self.d_k = d_model // num_heads - - self.W_q = nn.Linear(d_model, d_model) - self.W_k = nn.Linear(d_model, d_model) - self.W_v = nn.Linear(d_model, d_model) - self.W_o = nn.Linear(d_model, d_model) - self.dropout = nn.Dropout(0.1) - self.layer_norm = nn.LayerNorm(d_model) - - def forward(self, x): - batch_size, seq_len, _ = x.size() - - Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2) - K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2) - V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2) - - scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_k) - attn = F.softmax(scores, dim=-1) - attn = self.dropout(attn) - - context = torch.matmul(attn, V) - context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model) - output = self.W_o(context) - - return self.layer_norm(x + self.dropout(output)) - - -class MultiDiscreteActorCritic(nn.Module): - """MultiDiscrete Actor-Critic with Spatial Attention""" - - def __init__( - self, - state_dim: int, - action_dims: List[int], - hidden_dim: int = 128, - num_heads: int = 4, - num_layers: int = 2, - ): - super().__init__() - self.state_dim = state_dim - self.action_dims = action_dims - self.num_zones = len(action_dims) - - # 状态编码 - self.state_encoder = nn.Sequential( - nn.Linear(state_dim, hidden_dim * self.num_zones), - nn.ReLU(), - nn.LayerNorm(hidden_dim * self.num_zones) - ) - - # 位置编码 - self.pos_encoding = nn.Parameter(torch.randn(1, self.num_zones, hidden_dim)) - - # 注意力层 - self.attention_layers = nn.ModuleList([ - SpatialAttentionMultiDiscrete(hidden_dim, num_heads) for _ in range(num_layers) - ]) - - self.ffn_layers = nn.ModuleList([ - nn.Sequential( - nn.Linear(hidden_dim, hidden_dim * 2), - nn.ReLU(), - nn.Dropout(0.1), - nn.Linear(hidden_dim * 2, hidden_dim), - nn.LayerNorm(hidden_dim) - ) for _ in range(num_layers) - ]) - - # Actor heads (每个zone独立) - self.actor_heads = nn.ModuleList([ - nn.Linear(hidden_dim, adim) for adim in action_dims - ]) - - # Critic - self.critic = nn.Sequential( - nn.Linear(hidden_dim * self.num_zones, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, 1) - ) - - self._init_weights() - - def _init_weights(self): - for m in self.modules(): - if isinstance(m, nn.Linear): - nn.init.orthogonal_(m.weight, gain=np.sqrt(2)) - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - for head in self.actor_heads: - nn.init.orthogonal_(head.weight, gain=0.01) - - def forward(self, state): - batch_size = state.size(0) - - # 编码为zone表示 - x = self.state_encoder(state) - x = x.view(batch_size, self.num_zones, -1) - x = x + self.pos_encoding - - # 注意力 - for attn, ffn in zip(self.attention_layers, self.ffn_layers): - x = attn(x) - x = x + ffn(x) - - # Actor: 每个zone独立输出 - logits_list = [head(x[:, i, :]) for i, head in enumerate(self.actor_heads)] - - # Critic - global_feat = x.view(batch_size, -1) - value = self.critic(global_feat) - - return logits_list, value - - def get_value(self, state): - _, value = self.forward(state) - return value - - -class APPOSUMOAgent: - """APPO Agent for SUMO MultiDiscrete Action Space""" - - def __init__( - self, - state_dim: int, - action_dims: List[int], - hidden_dim: int = 128, - num_heads: int = 4, - num_layers: int = 2, - learning_rate: float = 3e-4, - gamma: float = 0.99, - gae_lambda: float = 0.95, - clip_epsilon: float = 0.2, - value_coef: float = 0.5, - entropy_coef: float = 0.02, - max_grad_norm: float = 0.5, - ppo_epochs: int = 10, - minibatch_size: int = 64, - device: str = "cuda", - total_episodes: int = 300, - ): - self.device = torch.device(device if torch.cuda.is_available() else "cpu") - self.gamma = gamma - self.gae_lambda = gae_lambda - self.clip_epsilon = clip_epsilon - self.value_coef = value_coef - self.entropy_coef = entropy_coef - self.max_grad_norm = max_grad_norm - self.ppo_epochs = ppo_epochs - self.minibatch_size = minibatch_size - self.action_dims = action_dims - - self.policy = MultiDiscreteActorCritic( - state_dim, action_dims, hidden_dim, num_heads, num_layers - ).to(self.device) - - self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate, eps=1e-5) - self.scheduler = optim.lr_scheduler.CosineAnnealingLR( - self.optimizer, T_max=total_episodes, eta_min=learning_rate * 0.1 - ) - - self.reset_buffers() - - def reset_buffers(self): - self.states = [] - self.actions = [] - self.rewards = [] - self.values = [] - self.log_probs = [] - self.dones = [] - - def select_action(self, state: np.ndarray, deterministic: bool = False) -> Tuple[np.ndarray, float, float]: - state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device) - - with torch.no_grad(): - logits_list, value = self.policy(state_tensor) - - actions = [] - log_prob_total = 0.0 - - for logits in logits_list: - probs = F.softmax(logits, dim=-1) - if deterministic: - action = torch.argmax(probs, dim=-1).item() - else: - dist = torch.distributions.Categorical(probs) - action = dist.sample().item() - actions.append(action) - log_prob_total += torch.log(probs[0, action] + 1e-10).item() - - return np.array(actions), log_prob_total, value.item() - - def store_transition(self, state, action, reward, value, log_prob, done): - self.states.append(state) - self.actions.append(action) - self.rewards.append(reward) - self.values.append(value) - self.log_probs.append(log_prob) - self.dones.append(done) - - def compute_gae(self, next_value: float) -> Tuple[np.ndarray, np.ndarray]: - advantages = [] - gae = 0 - - for t in reversed(range(len(self.rewards))): - if t == len(self.rewards) - 1: - next_val = next_value - else: - next_val = self.values[t + 1] - - delta = self.rewards[t] + self.gamma * next_val * (1 - self.dones[t]) - self.values[t] - gae = delta + self.gamma * self.gae_lambda * (1 - self.dones[t]) * gae - advantages.insert(0, gae) - - advantages = np.array(advantages, dtype=np.float32) - returns = advantages + np.array(self.values, dtype=np.float32) - return advantages, returns - - def update(self, next_value: float) -> Dict[str, float]: - if len(self.states) == 0: - return {} - - advantages, returns = self.compute_gae(next_value) - advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) - - states = torch.FloatTensor(np.array(self.states)).to(self.device) - actions = torch.LongTensor(np.array(self.actions)).to(self.device) - old_log_probs = torch.FloatTensor(self.log_probs).to(self.device) - advantages_t = torch.FloatTensor(advantages).to(self.device) - returns_t = torch.FloatTensor(returns).to(self.device) - - total_policy_loss = 0 - total_value_loss = 0 - total_entropy = 0 - update_count = 0 - - dataset_size = len(self.states) - for _ in range(self.ppo_epochs): - indices = np.random.permutation(dataset_size) - - for start_idx in range(0, dataset_size, self.minibatch_size): - end_idx = min(start_idx + self.minibatch_size, dataset_size) - batch_idx = indices[start_idx:end_idx] - - batch_states = states[batch_idx] - batch_actions = actions[batch_idx] - batch_old_lp = old_log_probs[batch_idx] - batch_adv = advantages_t[batch_idx] - batch_ret = returns_t[batch_idx] - - logits_list, values = self.policy(batch_states) - - new_log_probs = torch.zeros(len(batch_idx), device=self.device) - entropy = torch.zeros(len(batch_idx), device=self.device) - - for i, logits in enumerate(logits_list): - probs = F.softmax(logits, dim=-1) - dist = torch.distributions.Categorical(probs) - new_log_probs += dist.log_prob(batch_actions[:, i]) - entropy += dist.entropy() - - new_log_probs_mean = new_log_probs - entropy_mean = entropy.mean() - - ratio = torch.exp(new_log_probs_mean - batch_old_lp) - surr1 = ratio * batch_adv - surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_adv - policy_loss = -torch.min(surr1, surr2).mean() - - value_loss = F.mse_loss(values.squeeze(), batch_ret) - - loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy_mean - - self.optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) - self.optimizer.step() - - total_policy_loss += policy_loss.item() - total_value_loss += value_loss.item() - total_entropy += entropy_mean.item() - update_count += 1 - - self.scheduler.step() - self.reset_buffers() - - return { - 'policy_loss': total_policy_loss / max(update_count, 1), - 'value_loss': total_value_loss / max(update_count, 1), - 'entropy': total_entropy / max(update_count, 1), - } - - def save(self, path: str): - torch.save({ - 'policy_state_dict': self.policy.state_dict(), - 'optimizer_state_dict': self.optimizer.state_dict(), - }, path) - - def load(self, path: str): - checkpoint = torch.load(path, map_location=self.device, weights_only=False) - self.policy.load_state_dict(checkpoint['policy_state_dict']) - self.optimizer.load_state_dict(checkpoint['optimizer_state_dict']) diff --git a/show_ppo_decisions.py b/show_ppo_decisions.py deleted file mode 100644 index 0d77ac3..0000000 --- a/show_ppo_decisions.py +++ /dev/null @@ -1,41 +0,0 @@ -"""显示PPO控制决策""" -import numpy as np -import yaml -from sumo_vsl_environment import SUMOVSLEnvironment -from ppo_agent import PPOAgent - -with open('config_sumo_vsl.yaml', 'r', encoding='utf-8') as f: - config = yaml.safe_load(f) - -env = SUMOVSLEnvironment(config) -agent = PPOAgent( - state_dim=env.state_dim, - action_dims=[5] * env.num_control_zones, - hidden_layers=[256, 256, 128], - learning_rate=3e-4, - device='cuda' -) - -try: - agent.load('checkpoints_sumo_ppo/best_model.pt') - print('已加载训练好的PPO模型\n') -except: - print('警告: 未找到训练好的模型\n') - -state = env.reset(seed=42) -speed_map = {0: 40, 1: 60, 2: 80, 3: 100, 4: 120} - -print('PPO控制决策 (全部60步):') -print('步数 | Zone0 | Zone1 | Zone2 | Zone3 | Zone4') -print('-' * 50) - -for step in range(60): - action, _, _ = agent.select_action(state, deterministic=True) - speeds = [speed_map[int(a)] for a in action] - print(f'{step:3d} | {speeds[0]:3d}km | {speeds[1]:3d}km | {speeds[2]:3d}km | {speeds[3]:3d}km | {speeds[4]:3d}km') - - state, _, done, _ = env.step(action) - if done: - break - -env.close() diff --git a/train_sumo_appo.py b/train_sumo_appo.py index f9371c3..2a5361b 100644 --- a/train_sumo_appo.py +++ b/train_sumo_appo.py @@ -14,7 +14,7 @@ from tqdm import tqdm import torch from sumo_edge_vsl_environment import SUMOEdgeVSLEnvironment -from appo_sumo_agent import APPOSUMOAgent +from appo_agent import APPOAgent from training_logger import TrainingLogger @@ -53,7 +53,7 @@ def train_sumo_appo(): print(f" 设备: {agent_config.get('device', 'cuda')}") print() - agent = APPOSUMOAgent( + agent = APPOAgent( state_dim=state_dim, action_dims=action_dims, hidden_dim=agent_config.get("hidden_dim", 128),