# Source: ctm-dqn/agents/appo_agent.py
# (332 lines, 12 KiB, Python)

"""
APPO agent for SUMO variable speed limit (VSL) control with edge-structured tokenization.
"""
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from typing import Dict, List, Tuple
class SpatialAttentionBlock(nn.Module):
    """Post-norm self-attention block applied across the edge-token axis.

    The block runs multi-head self-attention followed by a two-layer GELU
    feed-forward network. Each sub-layer sits on a residual branch and is
    followed by layer normalisation.

    Args:
        hidden_dim: Width of every token embedding (input and output).
        num_heads: Number of attention heads.
        dropout: Dropout probability used inside the attention module, inside
            the feed-forward network and on both residual branches.
    """

    def __init__(self, hidden_dim: int, num_heads: int = 4, dropout: float = 0.1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, tokens, hidden_dim).
        self.attn = nn.MultiheadAttention(
            hidden_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

        # Feed-forward network widens to twice the token width and back.
        inner_dim = hidden_dim * 2
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, inner_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(inner_dim, hidden_dim),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the transformed tokens; the output has the same shape as ``x``."""
        # Attention weights are not needed, only the attended values.
        attended = self.attn(x, x, x, need_weights=False)[0]
        residual = self.norm1(x + self.dropout(attended))
        return self.norm2(residual + self.dropout(self.ffn(residual)))
class MultiDiscreteActorCritic(nn.Module):
    """Actor-critic network that turns a flat state into one token per zone.

    The flat state vector is read in this order (anything after the last
    slice is ignored):

    * ``num_zones * edge_feature_dim`` per-zone edge features,
    * ``num_zones`` per-zone values (named "speed limits" by the code),
    * ``time_feature_dim + 1`` global features shared by all zones.

    Each zone token is the concatenation of its edge features, its per-zone
    value, the shared global features and a scalar zone id in ``[0, 1]``.
    Tokens are encoded, offset by a learned positional parameter and passed
    through the attention stack. Every zone has its own linear policy head,
    and a single critic reads mean- and max-pooled tokens plus the global
    features.

    Args:
        state_dim: Length of the flat state vector (stored, not validated).
        action_dims: Number of discrete choices per zone; its length fixes
            the number of zones/tokens.
        edge_feature_dim: Number of edge features per zone.
        time_feature_dim: Number of time features in the global block.
        hidden_dim: Token embedding width.
        num_heads: Attention heads per attention block.
        num_layers: Number of stacked attention blocks.
        dropout: Dropout probability handed to every attention block.
    """

    def __init__(
        self,
        state_dim: int,
        action_dims: List[int],
        edge_feature_dim: int = 3,
        time_feature_dim: int = 3,
        hidden_dim: int = 128,
        num_heads: int = 4,
        num_layers: int = 2,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.state_dim = state_dim
        self.action_dims = action_dims
        self.num_zones = len(action_dims)

        # Widths of the individual pieces that make up one zone token.
        self.edge_feature_dim = edge_feature_dim
        self.speed_feature_dim = 1
        self.time_feature_dim = time_feature_dim
        self.last_reward_dim = 1
        self.global_feature_dim = self.time_feature_dim + self.last_reward_dim
        self.agent_id_dim = 1
        self.local_obs_dim = sum(
            (
                self.edge_feature_dim,
                self.speed_feature_dim,
                self.global_feature_dim,
                self.agent_id_dim,
            )
        )

        # Token encoder shared by all zones.
        self.local_encoder = nn.Sequential(
            nn.Linear(self.local_obs_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
        )
        self.pos_encoding = nn.Parameter(torch.zeros(1, self.num_zones, hidden_dim))

        attention_stack = nn.ModuleList()
        for _ in range(num_layers):
            attention_stack.append(
                SpatialAttentionBlock(hidden_dim, num_heads=num_heads, dropout=dropout)
            )
        self.attention_layers = attention_stack

        # One categorical head per zone, each with its own number of choices.
        self.actor_heads = nn.ModuleList(
            [nn.Linear(hidden_dim, n_choices) for n_choices in action_dims]
        )

        # Critic input: mean-pooled tokens + max-pooled tokens + global features.
        critic_in = 2 * hidden_dim + self.global_feature_dim
        self.critic = nn.Sequential(
            nn.Linear(critic_in, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

        # Fixed (non-trainable) scalar id per zone, evenly spaced on [0, 1].
        zone_ids = torch.linspace(0.0, 1.0, self.num_zones, dtype=torch.float32)
        self.register_buffer("agent_id_features", zone_ids[None, :, None])

        self._init_weights()

    def _init_weights(self):
        """Orthogonally initialise all linear layers, then rescale the output heads."""
        default_gain = np.sqrt(2)
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for layer in linear_layers:
            nn.init.orthogonal_(layer.weight, gain=default_gain)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

        # Small policy-head gain keeps the initial action distribution near uniform.
        for policy_head in self.actor_heads:
            nn.init.orthogonal_(policy_head.weight, gain=0.01)

        value_out = self.critic[-1]
        nn.init.orthogonal_(value_out.weight, gain=1.0)
        nn.init.normal_(self.pos_encoding, mean=0.0, std=0.02)

    def _build_local_tokens(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Split a flat state into per-zone tokens and the shared global features.

        A 1-D state is treated as a batch of one.

        Returns:
            ``(tokens, global_features)`` where ``tokens`` has shape
            ``(batch, num_zones, local_obs_dim)`` and ``global_features`` is the
            ``(batch, global_feature_dim)`` slice of the state.
        """
        if state.dim() == 1:
            state = state[None, :]
        n_batch = state.size(0)
        zones = self.num_zones

        # Column boundaries of the three consecutive state sections.
        edge_end = zones * self.edge_feature_dim
        speed_end = edge_end + zones
        global_end = speed_end + self.global_feature_dim

        edge_part = state[:, :edge_end].view(n_batch, zones, self.edge_feature_dim)
        speed_part = state[:, edge_end:speed_end].view(n_batch, zones, 1)
        global_features = state[:, speed_end:global_end]

        # Every zone sees the same global features and its own fixed id.
        shared = global_features[:, None, :].expand(-1, zones, -1)
        zone_ids = self.agent_id_features.expand(n_batch, -1, -1)

        tokens = torch.cat((edge_part, speed_part, shared, zone_ids), dim=-1)
        return tokens, global_features

    def forward(self, state: torch.Tensor) -> Tuple[List[torch.Tensor], torch.Tensor]:
        """Return per-zone action logits and the state value.

        Returns:
            ``(logits_list, value)``: one ``(batch, action_dims[i])`` logits
            tensor per zone, and a ``(batch, 1)`` value tensor.
        """
        tokens, global_features = self._build_local_tokens(state)
        hidden = self.local_encoder(tokens) + self.pos_encoding
        for block in self.attention_layers:
            hidden = block(hidden)

        # Zone i is decoded from token i by head i.
        logits_list = []
        for zone_idx, policy_head in enumerate(self.actor_heads):
            logits_list.append(policy_head(hidden[:, zone_idx]))

        summary = torch.cat(
            (hidden.mean(dim=1), hidden.max(dim=1)[0], global_features), dim=-1
        )
        return logits_list, self.critic(summary)

    def get_value(self, state: torch.Tensor) -> torch.Tensor:
        """Return only the critic output for ``state``."""
        return self.forward(state)[1]
class APPOAgent:
    """APPO agent for SUMO MultiDiscrete action space.

    Collects transitions into in-memory buffers and optimises a
    ``MultiDiscreteActorCritic`` with the clipped PPO surrogate objective,
    GAE advantages, an MSE value loss and an entropy bonus.

    Args:
        state_dim: Length of the flat observation vector.
        action_dims: Number of discrete choices for every action component.
        edge_feature_dim: Forwarded to the network.
        time_feature_dim: Forwarded to the network.
        hidden_dim: Forwarded to the network.
        num_heads: Forwarded to the network.
        num_layers: Forwarded to the network.
        learning_rate: Initial Adam learning rate.
        gamma: Discount factor.
        gae_lambda: GAE smoothing factor.
        clip_epsilon: PPO ratio clipping range.
        value_coef: Weight of the value loss.
        entropy_coef: Weight of the entropy bonus.
        max_grad_norm: Gradient-norm clipping threshold.
        ppo_epochs: Passes over the collected data per ``update`` call.
        minibatch_size: Minibatch size inside each pass.
        device: Requested device; falls back to CPU when CUDA is unavailable.
        lr_schedule: ``"cosine"`` enables cosine annealing down to 10% of
            ``learning_rate``; any other value disables scheduling.
        total_episodes: ``T_max`` of the cosine schedule. The scheduler is
            stepped once per ``update`` call, so this presumably assumes one
            update per episode -- verify against the training loop.
        dropout: Dropout probability forwarded to the network. Defaults to the
            network's own default, so existing callers are unaffected.
    """

    def __init__(
        self,
        state_dim: int,
        action_dims: List[int],
        edge_feature_dim: int = 3,
        time_feature_dim: int = 3,
        hidden_dim: int = 128,
        num_heads: int = 4,
        num_layers: int = 2,
        learning_rate: float = 3e-4,
        gamma: float = 0.99,
        gae_lambda: float = 0.95,
        clip_epsilon: float = 0.2,
        value_coef: float = 0.5,
        entropy_coef: float = 0.02,
        max_grad_norm: float = 0.5,
        ppo_epochs: int = 10,
        minibatch_size: int = 64,
        device: str = "cuda",
        lr_schedule: str = "cosine",
        total_episodes: int = 300,
        dropout: float = 0.1,
    ):
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_epsilon = clip_epsilon
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.action_dims = action_dims

        self.policy = MultiDiscreteActorCritic(
            state_dim=state_dim,
            action_dims=action_dims,
            edge_feature_dim=edge_feature_dim,
            time_feature_dim=time_feature_dim,
            hidden_dim=hidden_dim,
            num_heads=num_heads,
            num_layers=num_layers,
            dropout=dropout,
        ).to(self.device)

        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate, eps=1e-5)
        if lr_schedule == "cosine":
            self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
                self.optimizer,
                T_max=total_episodes,
                eta_min=learning_rate * 0.1,
            )
        else:
            self.scheduler = None

        self.reset_buffers()

    def reset_buffers(self):
        """Drop every stored transition."""
        self.states = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.log_probs = []
        self.dones = []

    def select_action(self, state: np.ndarray, deterministic: bool = False) -> Tuple[np.ndarray, float, float]:
        """Choose one action per component for a single observation.

        The network is evaluated in eval mode so dropout cannot perturb the
        logits or the value estimate. Without this, ``deterministic=True``
        would still return different results on repeated calls. The previous
        train/eval mode is restored before returning.

        Args:
            state: Flat observation for one environment step.
            deterministic: Take the arg-max action instead of sampling.

        Returns:
            ``(actions, log_prob, value)``: the chosen action indices as an
            ``int64`` array, the summed log-probability of those actions, and
            the critic's value estimate.
        """
        state_tensor = torch.as_tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)

        was_training = self.policy.training
        self.policy.eval()
        try:
            with torch.no_grad():
                logits_list, value = self.policy(state_tensor)
        finally:
            # Leave the module in whatever mode the caller had it in.
            self.policy.train(was_training)

        actions = []
        log_prob_total = 0.0
        for logits in logits_list:
            dist = torch.distributions.Categorical(logits=logits)
            action = torch.argmax(logits, dim=-1) if deterministic else dist.sample()
            actions.append(action.item())
            # Components are treated as independent, so log-probs add up.
            log_prob_total += dist.log_prob(action).item()

        return np.array(actions, dtype=np.int64), log_prob_total, value.item()

    def store_transition(self, state, action, reward, value, log_prob, done):
        """Append one transition to the rollout buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.dones.append(done)

    def compute_gae(self, next_value: float) -> Tuple[np.ndarray, np.ndarray]:
        """Compute GAE advantages and returns for the buffered rollout.

        Args:
            next_value: Value estimate of the state following the last stored
                transition, used for bootstrapping.

        Returns:
            ``(advantages, returns)`` as ``float32`` arrays with one entry per
            stored transition; ``returns = advantages + values``.
        """
        n_steps = len(self.rewards)
        # Filled back-to-front; avoids the quadratic cost of list.insert(0, ...).
        advantages = np.zeros(n_steps, dtype=np.float32)
        gae = 0.0
        for t in reversed(range(n_steps)):
            next_val = next_value if t == n_steps - 1 else self.values[t + 1]
            # A terminal step cuts both the bootstrap and the running trace.
            not_done = 1 - self.dones[t]
            delta = self.rewards[t] + self.gamma * next_val * not_done - self.values[t]
            gae = delta + self.gamma * self.gae_lambda * not_done * gae
            advantages[t] = gae
        returns = advantages + np.array(self.values, dtype=np.float32)
        return advantages, returns

    def update(self, next_value: float) -> Dict[str, float]:
        """Run the PPO update on the buffered rollout and clear the buffers.

        The policy is used in whatever train/eval mode it currently is in.

        Args:
            next_value: Bootstrap value for the state after the last stored
                transition.

        Returns:
            Mean ``policy_loss``, ``value_loss`` and ``entropy`` over all
            minibatch steps, or an empty dict when nothing was stored.
        """
        if not self.states:
            return {}

        advantages, returns = self.compute_gae(next_value)
        # Normalise over the whole rollout; epsilon guards a zero std.
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        states = torch.as_tensor(np.array(self.states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.array(self.actions), dtype=torch.long, device=self.device)
        old_log_probs = torch.as_tensor(self.log_probs, dtype=torch.float32, device=self.device)
        advantages_t = torch.as_tensor(advantages, dtype=torch.float32, device=self.device)
        returns_t = torch.as_tensor(returns, dtype=torch.float32, device=self.device)

        total_policy_loss = 0.0
        total_value_loss = 0.0
        total_entropy = 0.0
        update_count = 0
        dataset_size = len(self.states)

        for _ in range(self.ppo_epochs):
            indices = np.random.permutation(dataset_size)
            for start_idx in range(0, dataset_size, self.minibatch_size):
                end_idx = min(start_idx + self.minibatch_size, dataset_size)
                batch_idx = indices[start_idx:end_idx]

                batch_states = states[batch_idx]
                batch_actions = actions[batch_idx]
                batch_old_lp = old_log_probs[batch_idx]
                batch_adv = advantages_t[batch_idx]
                batch_ret = returns_t[batch_idx]

                logits_list, values = self.policy(batch_states)

                # Joint log-prob / entropy are sums over the action components.
                new_log_probs = torch.zeros(len(batch_idx), device=self.device)
                entropy = torch.zeros(len(batch_idx), device=self.device)
                for i, logits in enumerate(logits_list):
                    dist = torch.distributions.Categorical(logits=logits)
                    new_log_probs += dist.log_prob(batch_actions[:, i])
                    entropy += dist.entropy()
                entropy_mean = entropy.mean()

                # Clipped surrogate objective.
                ratio = torch.exp(new_log_probs - batch_old_lp)
                surr1 = ratio * batch_adv
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_adv
                policy_loss = -torch.min(surr1, surr2).mean()

                value_loss = nn.functional.mse_loss(values.squeeze(-1), batch_ret)
                loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy_mean

                self.optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
                self.optimizer.step()

                total_policy_loss += policy_loss.item()
                total_value_loss += value_loss.item()
                total_entropy += entropy_mean.item()
                update_count += 1

        # One scheduler step per update() call, not per minibatch.
        if self.scheduler is not None:
            self.scheduler.step()

        self.reset_buffers()

        denom = max(update_count, 1)
        return {
            "policy_loss": total_policy_loss / denom,
            "value_loss": total_value_loss / denom,
            "entropy": total_entropy / denom,
        }

    def save(self, path: str):
        """Write policy, optimizer and (if present) scheduler state to ``path``."""
        checkpoint = {
            "policy_state_dict": self.policy.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
        }
        # Without the scheduler state a resumed run would restart the LR schedule.
        if self.scheduler is not None:
            checkpoint["scheduler_state_dict"] = self.scheduler.state_dict()
        torch.save(checkpoint, path)

    def load(self, path: str):
        """Restore state written by ``save``.

        Checkpoints without a ``scheduler_state_dict`` entry (written before
        the scheduler was saved) still load; the scheduler is then left as is.
        """
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects. Only
        # load checkpoints from trusted sources.
        checkpoint = torch.load(path, map_location=self.device, weights_only=False)
        self.policy.load_state_dict(checkpoint["policy_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        scheduler_state = checkpoint.get("scheduler_state_dict")
        if self.scheduler is not None and scheduler_state is not None:
            self.scheduler.load_state_dict(scheduler_state)