方法重命名
This commit is contained in:
parent
4d68003fce
commit
29aae92ad2
|
|
@ -1,321 +0,0 @@
|
|||
"""
|
||||
APPO for SUMO - MultiDiscrete动作空间版本
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import numpy as np
|
||||
from typing import Tuple, List, Dict
|
||||
|
||||
|
||||
class SpatialAttentionMultiDiscrete(nn.Module):
|
||||
"""空间注意力 + MultiDiscrete输出"""
|
||||
|
||||
def __init__(self, d_model: int, num_heads: int = 4):
|
||||
super().__init__()
|
||||
assert d_model % num_heads == 0
|
||||
self.d_model = d_model
|
||||
self.num_heads = num_heads
|
||||
self.d_k = d_model // num_heads
|
||||
|
||||
self.W_q = nn.Linear(d_model, d_model)
|
||||
self.W_k = nn.Linear(d_model, d_model)
|
||||
self.W_v = nn.Linear(d_model, d_model)
|
||||
self.W_o = nn.Linear(d_model, d_model)
|
||||
self.dropout = nn.Dropout(0.1)
|
||||
self.layer_norm = nn.LayerNorm(d_model)
|
||||
|
||||
def forward(self, x):
|
||||
batch_size, seq_len, _ = x.size()
|
||||
|
||||
Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
|
||||
K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
|
||||
V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
|
||||
|
||||
scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.d_k)
|
||||
attn = F.softmax(scores, dim=-1)
|
||||
attn = self.dropout(attn)
|
||||
|
||||
context = torch.matmul(attn, V)
|
||||
context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
|
||||
output = self.W_o(context)
|
||||
|
||||
return self.layer_norm(x + self.dropout(output))
|
||||
|
||||
|
||||
class MultiDiscreteActorCritic(nn.Module):
|
||||
"""MultiDiscrete Actor-Critic with Spatial Attention"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
state_dim: int,
|
||||
action_dims: List[int],
|
||||
hidden_dim: int = 128,
|
||||
num_heads: int = 4,
|
||||
num_layers: int = 2,
|
||||
):
|
||||
super().__init__()
|
||||
self.state_dim = state_dim
|
||||
self.action_dims = action_dims
|
||||
self.num_zones = len(action_dims)
|
||||
|
||||
# 状态编码
|
||||
self.state_encoder = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim * self.num_zones),
|
||||
nn.ReLU(),
|
||||
nn.LayerNorm(hidden_dim * self.num_zones)
|
||||
)
|
||||
|
||||
# 位置编码
|
||||
self.pos_encoding = nn.Parameter(torch.randn(1, self.num_zones, hidden_dim))
|
||||
|
||||
# 注意力层
|
||||
self.attention_layers = nn.ModuleList([
|
||||
SpatialAttentionMultiDiscrete(hidden_dim, num_heads) for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.ffn_layers = nn.ModuleList([
|
||||
nn.Sequential(
|
||||
nn.Linear(hidden_dim, hidden_dim * 2),
|
||||
nn.ReLU(),
|
||||
nn.Dropout(0.1),
|
||||
nn.Linear(hidden_dim * 2, hidden_dim),
|
||||
nn.LayerNorm(hidden_dim)
|
||||
) for _ in range(num_layers)
|
||||
])
|
||||
|
||||
# Actor heads (每个zone独立)
|
||||
self.actor_heads = nn.ModuleList([
|
||||
nn.Linear(hidden_dim, adim) for adim in action_dims
|
||||
])
|
||||
|
||||
# Critic
|
||||
self.critic = nn.Sequential(
|
||||
nn.Linear(hidden_dim * self.num_zones, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, 1)
|
||||
)
|
||||
|
||||
self._init_weights()
|
||||
|
||||
def _init_weights(self):
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Linear):
|
||||
nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
|
||||
if m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
for head in self.actor_heads:
|
||||
nn.init.orthogonal_(head.weight, gain=0.01)
|
||||
|
||||
def forward(self, state):
|
||||
batch_size = state.size(0)
|
||||
|
||||
# 编码为zone表示
|
||||
x = self.state_encoder(state)
|
||||
x = x.view(batch_size, self.num_zones, -1)
|
||||
x = x + self.pos_encoding
|
||||
|
||||
# 注意力
|
||||
for attn, ffn in zip(self.attention_layers, self.ffn_layers):
|
||||
x = attn(x)
|
||||
x = x + ffn(x)
|
||||
|
||||
# Actor: 每个zone独立输出
|
||||
logits_list = [head(x[:, i, :]) for i, head in enumerate(self.actor_heads)]
|
||||
|
||||
# Critic
|
||||
global_feat = x.view(batch_size, -1)
|
||||
value = self.critic(global_feat)
|
||||
|
||||
return logits_list, value
|
||||
|
||||
def get_value(self, state):
|
||||
_, value = self.forward(state)
|
||||
return value
|
||||
|
||||
|
||||
class APPOSUMOAgent:
|
||||
"""APPO Agent for SUMO MultiDiscrete Action Space"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
state_dim: int,
|
||||
action_dims: List[int],
|
||||
hidden_dim: int = 128,
|
||||
num_heads: int = 4,
|
||||
num_layers: int = 2,
|
||||
learning_rate: float = 3e-4,
|
||||
gamma: float = 0.99,
|
||||
gae_lambda: float = 0.95,
|
||||
clip_epsilon: float = 0.2,
|
||||
value_coef: float = 0.5,
|
||||
entropy_coef: float = 0.02,
|
||||
max_grad_norm: float = 0.5,
|
||||
ppo_epochs: int = 10,
|
||||
minibatch_size: int = 64,
|
||||
device: str = "cuda",
|
||||
total_episodes: int = 300,
|
||||
):
|
||||
self.device = torch.device(device if torch.cuda.is_available() else "cpu")
|
||||
self.gamma = gamma
|
||||
self.gae_lambda = gae_lambda
|
||||
self.clip_epsilon = clip_epsilon
|
||||
self.value_coef = value_coef
|
||||
self.entropy_coef = entropy_coef
|
||||
self.max_grad_norm = max_grad_norm
|
||||
self.ppo_epochs = ppo_epochs
|
||||
self.minibatch_size = minibatch_size
|
||||
self.action_dims = action_dims
|
||||
|
||||
self.policy = MultiDiscreteActorCritic(
|
||||
state_dim, action_dims, hidden_dim, num_heads, num_layers
|
||||
).to(self.device)
|
||||
|
||||
self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate, eps=1e-5)
|
||||
self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
|
||||
self.optimizer, T_max=total_episodes, eta_min=learning_rate * 0.1
|
||||
)
|
||||
|
||||
self.reset_buffers()
|
||||
|
||||
def reset_buffers(self):
|
||||
self.states = []
|
||||
self.actions = []
|
||||
self.rewards = []
|
||||
self.values = []
|
||||
self.log_probs = []
|
||||
self.dones = []
|
||||
|
||||
def select_action(self, state: np.ndarray, deterministic: bool = False) -> Tuple[np.ndarray, float, float]:
|
||||
state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
|
||||
|
||||
with torch.no_grad():
|
||||
logits_list, value = self.policy(state_tensor)
|
||||
|
||||
actions = []
|
||||
log_prob_total = 0.0
|
||||
|
||||
for logits in logits_list:
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
if deterministic:
|
||||
action = torch.argmax(probs, dim=-1).item()
|
||||
else:
|
||||
dist = torch.distributions.Categorical(probs)
|
||||
action = dist.sample().item()
|
||||
actions.append(action)
|
||||
log_prob_total += torch.log(probs[0, action] + 1e-10).item()
|
||||
|
||||
return np.array(actions), log_prob_total, value.item()
|
||||
|
||||
def store_transition(self, state, action, reward, value, log_prob, done):
|
||||
self.states.append(state)
|
||||
self.actions.append(action)
|
||||
self.rewards.append(reward)
|
||||
self.values.append(value)
|
||||
self.log_probs.append(log_prob)
|
||||
self.dones.append(done)
|
||||
|
||||
def compute_gae(self, next_value: float) -> Tuple[np.ndarray, np.ndarray]:
|
||||
advantages = []
|
||||
gae = 0
|
||||
|
||||
for t in reversed(range(len(self.rewards))):
|
||||
if t == len(self.rewards) - 1:
|
||||
next_val = next_value
|
||||
else:
|
||||
next_val = self.values[t + 1]
|
||||
|
||||
delta = self.rewards[t] + self.gamma * next_val * (1 - self.dones[t]) - self.values[t]
|
||||
gae = delta + self.gamma * self.gae_lambda * (1 - self.dones[t]) * gae
|
||||
advantages.insert(0, gae)
|
||||
|
||||
advantages = np.array(advantages, dtype=np.float32)
|
||||
returns = advantages + np.array(self.values, dtype=np.float32)
|
||||
return advantages, returns
|
||||
|
||||
def update(self, next_value: float) -> Dict[str, float]:
|
||||
if len(self.states) == 0:
|
||||
return {}
|
||||
|
||||
advantages, returns = self.compute_gae(next_value)
|
||||
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
|
||||
|
||||
states = torch.FloatTensor(np.array(self.states)).to(self.device)
|
||||
actions = torch.LongTensor(np.array(self.actions)).to(self.device)
|
||||
old_log_probs = torch.FloatTensor(self.log_probs).to(self.device)
|
||||
advantages_t = torch.FloatTensor(advantages).to(self.device)
|
||||
returns_t = torch.FloatTensor(returns).to(self.device)
|
||||
|
||||
total_policy_loss = 0
|
||||
total_value_loss = 0
|
||||
total_entropy = 0
|
||||
update_count = 0
|
||||
|
||||
dataset_size = len(self.states)
|
||||
for _ in range(self.ppo_epochs):
|
||||
indices = np.random.permutation(dataset_size)
|
||||
|
||||
for start_idx in range(0, dataset_size, self.minibatch_size):
|
||||
end_idx = min(start_idx + self.minibatch_size, dataset_size)
|
||||
batch_idx = indices[start_idx:end_idx]
|
||||
|
||||
batch_states = states[batch_idx]
|
||||
batch_actions = actions[batch_idx]
|
||||
batch_old_lp = old_log_probs[batch_idx]
|
||||
batch_adv = advantages_t[batch_idx]
|
||||
batch_ret = returns_t[batch_idx]
|
||||
|
||||
logits_list, values = self.policy(batch_states)
|
||||
|
||||
new_log_probs = torch.zeros(len(batch_idx), device=self.device)
|
||||
entropy = torch.zeros(len(batch_idx), device=self.device)
|
||||
|
||||
for i, logits in enumerate(logits_list):
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
dist = torch.distributions.Categorical(probs)
|
||||
new_log_probs += dist.log_prob(batch_actions[:, i])
|
||||
entropy += dist.entropy()
|
||||
|
||||
new_log_probs_mean = new_log_probs
|
||||
entropy_mean = entropy.mean()
|
||||
|
||||
ratio = torch.exp(new_log_probs_mean - batch_old_lp)
|
||||
surr1 = ratio * batch_adv
|
||||
surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_adv
|
||||
policy_loss = -torch.min(surr1, surr2).mean()
|
||||
|
||||
value_loss = F.mse_loss(values.squeeze(), batch_ret)
|
||||
|
||||
loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy_mean
|
||||
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
|
||||
self.optimizer.step()
|
||||
|
||||
total_policy_loss += policy_loss.item()
|
||||
total_value_loss += value_loss.item()
|
||||
total_entropy += entropy_mean.item()
|
||||
update_count += 1
|
||||
|
||||
self.scheduler.step()
|
||||
self.reset_buffers()
|
||||
|
||||
return {
|
||||
'policy_loss': total_policy_loss / max(update_count, 1),
|
||||
'value_loss': total_value_loss / max(update_count, 1),
|
||||
'entropy': total_entropy / max(update_count, 1),
|
||||
}
|
||||
|
||||
def save(self, path: str):
|
||||
torch.save({
|
||||
'policy_state_dict': self.policy.state_dict(),
|
||||
'optimizer_state_dict': self.optimizer.state_dict(),
|
||||
}, path)
|
||||
|
||||
def load(self, path: str):
|
||||
checkpoint = torch.load(path, map_location=self.device, weights_only=False)
|
||||
self.policy.load_state_dict(checkpoint['policy_state_dict'])
|
||||
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
"""显示PPO控制决策"""
|
||||
import numpy as np
|
||||
import yaml
|
||||
from sumo_vsl_environment import SUMOVSLEnvironment
|
||||
from ppo_agent import PPOAgent
|
||||
|
||||
with open('config_sumo_vsl.yaml', 'r', encoding='utf-8') as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
env = SUMOVSLEnvironment(config)
|
||||
agent = PPOAgent(
|
||||
state_dim=env.state_dim,
|
||||
action_dims=[5] * env.num_control_zones,
|
||||
hidden_layers=[256, 256, 128],
|
||||
learning_rate=3e-4,
|
||||
device='cuda'
|
||||
)
|
||||
|
||||
try:
|
||||
agent.load('checkpoints_sumo_ppo/best_model.pt')
|
||||
print('已加载训练好的PPO模型\n')
|
||||
except:
|
||||
print('警告: 未找到训练好的模型\n')
|
||||
|
||||
state = env.reset(seed=42)
|
||||
speed_map = {0: 40, 1: 60, 2: 80, 3: 100, 4: 120}
|
||||
|
||||
print('PPO控制决策 (全部60步):')
|
||||
print('步数 | Zone0 | Zone1 | Zone2 | Zone3 | Zone4')
|
||||
print('-' * 50)
|
||||
|
||||
for step in range(60):
|
||||
action, _, _ = agent.select_action(state, deterministic=True)
|
||||
speeds = [speed_map[int(a)] for a in action]
|
||||
print(f'{step:3d} | {speeds[0]:3d}km | {speeds[1]:3d}km | {speeds[2]:3d}km | {speeds[3]:3d}km | {speeds[4]:3d}km')
|
||||
|
||||
state, _, done, _ = env.step(action)
|
||||
if done:
|
||||
break
|
||||
|
||||
env.close()
|
||||
|
|
@ -14,7 +14,7 @@ from tqdm import tqdm
|
|||
import torch
|
||||
|
||||
from sumo_edge_vsl_environment import SUMOEdgeVSLEnvironment
|
||||
from appo_sumo_agent import APPOSUMOAgent
|
||||
from appo_agent import APPOAgent
|
||||
from training_logger import TrainingLogger
|
||||
|
||||
|
||||
|
|
@ -53,7 +53,7 @@ def train_sumo_appo():
|
|||
print(f" 设备: {agent_config.get('device', 'cuda')}")
|
||||
print()
|
||||
|
||||
agent = APPOSUMOAgent(
|
||||
agent = APPOAgent(
|
||||
state_dim=state_dim,
|
||||
action_dims=action_dims,
|
||||
hidden_dim=agent_config.get("hidden_dim", 128),
|
||||
|
|
|
|||
Loading…
Reference in New Issue