# ctm-dqn/environment.py

"""
Training environment for DQN-based speed limit control.
"""
import numpy as np
from typing import Tuple, Dict, Optional
from ctm_model import CTMModel
from demand_loader import DemandLoader


class TrafficEnvironment:
    """Traffic environment for speed limit control.

    Wraps a ``CTMModel`` and exposes a ``reset`` / ``step`` interface in which
    each discrete action selects one speed limit that is applied to every
    cell of the model.
    """

    def __init__(self, config: dict):
        """Initialize the environment from a configuration dictionary.

        Args:
            config: Must contain an ``"environment"`` section (model geometry,
                demand settings, speed-limit range, ``num_speed_actions``,
                ``episode_length``, ``time_step``) and a ``"reward"`` section
                (``throughput_weight``, ``speed_weight``, ``density_weight``,
                ``action_change_weight``).

        Raises:
            KeyError: If a required configuration key is missing.
            ValueError: If ``demand_pattern`` is ``"csv"`` but no
                ``demand_csv_path`` is configured.
        """
        env_config = config["environment"]
        reward_config = config["reward"]

        self.ctm = CTMModel(
            num_cells=env_config["num_cells"],
            cell_length=env_config["cell_length"],
            free_flow_speed=env_config["free_flow_speed"],
            congestion_wave_speed=env_config["congestion_wave_speed"],
            max_density=env_config["max_density"],
            critical_density=env_config["critical_density"],
            jam_density=env_config["jam_density"],
            time_step=env_config["time_step"],
        )

        self.demand_mean = env_config["demand_mean"]
        self.demand_std = env_config["demand_std"]
        self.demand_pattern = env_config["demand_pattern"]

        # The CSV-backed loader is only created when the "csv" pattern is
        # selected; every other pattern leaves it as None.
        self.demand_loader = None
        if self.demand_pattern == "csv":
            csv_path = env_config.get("demand_csv_path")
            csv_column = env_config.get("demand_csv_column", "demand")
            if csv_path is None:
                raise ValueError("demand_csv_path must be specified when demand_pattern is 'csv'")
            self.demand_loader = DemandLoader(
                csv_path=csv_path,
                time_step=env_config["time_step"],
                demand_column=csv_column,
            )
            print(f"Using CSV demand from: {csv_path}")
            stats = self.demand_loader.get_statistics()
            if stats:
                print(f"Demand statistics: mean={stats['mean']:.1f}, std={stats['std']:.1f}, "
                      f"min={stats['min']:.1f}, max={stats['max']:.1f}")

        self.min_speed_limit = env_config["min_speed_limit"]
        self.max_speed_limit = env_config["max_speed_limit"]
        self.num_speed_actions = env_config["num_speed_actions"]
        self.episode_length = env_config["episode_length"]
        self.time_step = env_config["time_step"]

        self.throughput_weight = reward_config["throughput_weight"]
        self.speed_weight = reward_config["speed_weight"]
        self.density_weight = reward_config["density_weight"]
        self.action_change_weight = reward_config["action_change_weight"]

        # Action index k maps to the k-th evenly spaced speed limit between
        # min_speed_limit and max_speed_limit (both inclusive).
        self.speed_actions = np.linspace(
            self.min_speed_limit, self.max_speed_limit, self.num_speed_actions
        )

        self.current_step = 0
        self.previous_action = None
        self.episode_metrics = []
        self.current_inflow = 0.0  # Inflow of the latest step, used by the reward

    def reset(self) -> np.ndarray:
        """Reset the environment to its initial state.

        Resets the CTM model, the step counter, the previous action, the
        collected episode metrics, the stored inflow and (if present) the CSV
        demand loader.

        Returns:
            The state reported by ``self.ctm.get_state()`` after the reset.
        """
        self.ctm.reset()
        self.current_step = 0
        self.previous_action = None
        self.episode_metrics = []
        self.current_inflow = 0.0

        # Reset demand loader if using CSV
        if self.demand_loader is not None:
            self.demand_loader.reset()

        return self.ctm.get_state()

    def _generate_demand(self) -> float:
        """Generate the traffic demand for the current step.

        Supported patterns: ``"constant"`` (``demand_mean``), ``"sine"``
        (``demand_mean`` plus a sinusoid of amplitude ``demand_std``),
        ``"random"`` (normal draw with ``demand_mean`` / ``demand_std``) and
        ``"csv"`` (value from the demand loader). Any other pattern name
        falls back to ``demand_mean``.

        Returns:
            The demand, clamped so it is never negative.

        Raises:
            RuntimeError: If the pattern is ``"csv"`` but no loader exists.
        """
        if self.demand_pattern == "constant":
            demand = self.demand_mean
        elif self.demand_pattern == "sine":
            # Elapsed time divided by 3600; the sinusoid has a period of 2 in
            # that unit (hours if time_step is in seconds -- TODO confirm).
            t = self.current_step * self.time_step / 3600.0
            demand = self.demand_mean + self.demand_std * np.sin(2 * np.pi * t / 2.0)
        elif self.demand_pattern == "random":
            demand = np.random.normal(self.demand_mean, self.demand_std)
        elif self.demand_pattern == "csv":
            if self.demand_loader is None:
                raise RuntimeError("Demand loader not initialized for CSV pattern")
            demand = self.demand_loader.get_demand(self.current_step)
        else:
            demand = self.demand_mean

        # Clamp with a float literal so the result is a float even when the
        # clamp is active (matching the annotated return type).
        return max(0.0, demand)

    def _calculate_reward(self, info: Dict, action: int) -> float:
        """Calculate the reward from traffic metrics with non-linear normalization.

        The reward is the weighted sum of three terms, plus
        ``action_change_weight`` whenever the action differs from the
        previous one:

        * throughput: ``(tanh((throughput / inflow - 0.9) / 0.2) + 1) / 2``
        * speed: ``(avg_speed / max_speed_limit) ** 2``
        * density: ``1 - exp(-avg_density / critical_density)``

        NOTE(review): the density and action-change terms only act as
        penalties if their configured weights are negative -- confirm against
        the configuration.

        Args:
            info: Step info; the keys ``"throughput"`` and
                ``"average_density"`` are read.
            action: Index of the action taken in this step.

        Returns:
            The scalar reward.
        """
        throughput = info["throughput"]
        avg_density = info["average_density"]

        # Mean speed over cells that contain traffic. Per-cell speed is the
        # smaller of the cell's speed limit and a linear speed-density
        # relation scaled by the free-flow speed.
        speed_total = 0
        occupied_cells = 0
        for i in range(self.ctm.num_cells):
            density = self.ctm.densities[i]
            if density > 0:
                speed_total += min(
                    self.ctm.speed_limits[i],
                    self.ctm.free_flow_speed * (1 - density / self.ctm.jam_density),
                )
                occupied_cells += 1
        avg_speed = speed_total / occupied_cells if occupied_cells > 0 else 0

        # Throughput is normalized against the current inflow so the reward
        # adapts to demand: a ratio of 1.0 means output equals input, lower
        # values indicate congestion.
        inflow_target = self.current_inflow
        if inflow_target > 0:
            throughput_ratio = throughput / inflow_target
        else:
            # Demand is clamped at zero, so the inflow can legitimately be 0.
            # Dividing would raise ZeroDivisionError (or yield inf/nan with
            # NumPy scalars); with no demand there is no unmet demand, so the
            # ratio is treated as 1.0.
            # NOTE(review): design choice -- confirm 1.0 is the intended
            # reward for zero-demand steps.
            throughput_ratio = 1.0
        throughput_norm = (np.tanh((throughput_ratio - 0.9) / 0.2) + 1.0) / 2.0

        # Speed term: squaring makes low speeds contribute disproportionately
        # little.
        speed_norm = (avg_speed / self.max_speed_limit) ** 2

        # Density term: exp(-ratio) is 1 on an empty road and decays towards
        # 0 as density grows, so (1 - density_norm) grows with density.
        density_ratio = avg_density / self.ctm.critical_density
        density_norm = np.exp(-density_ratio)

        reward = (
            self.throughput_weight * throughput_norm
            + self.speed_weight * speed_norm
            + self.density_weight * (1.0 - density_norm)
        )

        if self.previous_action is not None and action != self.previous_action:
            reward += self.action_change_weight

        return reward

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
        """Execute one step in the environment.

        Applies the speed limit selected by ``action`` to every cell,
        generates the inflow demand, advances the CTM model and computes the
        reward.

        Args:
            action: Index into ``self.speed_actions``.

        Returns:
            A tuple ``(next_state, reward, done, info)``. ``done`` becomes
            true once ``episode_length`` steps have been taken. ``info`` is
            the dict returned by the model, extended with ``"reward"`` and
            ``"step"``; the same dict object is appended to
            ``self.episode_metrics``.
        """
        speed_limit = self.speed_actions[action]

        for i in range(self.ctm.num_cells):
            self.ctm.set_speed_limit(i, speed_limit)

        inflow = self._generate_demand()
        self.current_inflow = inflow  # Store current inflow for reward calculation
        # NOTE(review): hard-coded second argument of CTMModel.step --
        # presumably a downstream outflow capacity; confirm meaning and units.
        outflow = 2000.0

        next_state, info = self.ctm.step(inflow, outflow)

        reward = self._calculate_reward(info, action)

        self.current_step += 1
        done = self.current_step >= self.episode_length

        self.episode_metrics.append(info)
        self.previous_action = action

        info["reward"] = reward
        info["step"] = self.current_step

        return next_state, reward, done, info

    @property
    def state_dim(self) -> int:
        """Get state dimension (two values per CTM cell)."""
        return self.ctm.num_cells * 2

    @property
    def action_dim(self) -> int:
        """Get action dimension (number of discrete speed-limit actions)."""
        return self.num_speed_actions