from ._agent import Agent
from typing import Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from ._neural_net import QNet
from ddql_optimal_execution import State, get_device
class DDQL(Agent):
"""
The DDQL class inherits from the Agent class and implements a Double Deep Q-Learning (DDQL) algorithm.
Parameters
----------
state_dict : dict, optional
A dictionary containing the state of the agent, by default None
greedy_decay_rate : float, optional
Multiplicative factor applied to the greediness after each learning step, by default 0.95
target_update_rate : int, optional
Number of learning steps between two updates of the target network, by default 15
initial_greediness : float, optional
Initial probability of taking a random (exploratory) action, by default 1
mode : str, optional
The mode of the agent, either "train" or "eval", by default "train"
lr : float, optional
The learning rate, by default 1e-3
state_size : int, optional
The dimension of the state vector, by default 5
initial_budget : int, optional
The initial budget (inventory) of the agent, which also determines the size of the action space, by default 100
horizon : int, optional
The number of time steps in an execution episode, by default 100
gamma : float, optional
The gamma parameter used in the Q-Learning algorithm, by default 0.99
quadratic_penalty_coefficient : float, optional
The quadratic penalty coefficient used to penalize the agent for selling large quantities of stock, by default 0.01
verbose : bool, optional
Whether to print a message each time the target network is updated, by default False
Attributes
----------
device : torch.device
The device used to run the agent
main_net : QNet
The main neural network used to predict the Q-values of the state-action pairs
target_net : QNet
The target neural network used to predict the Q-values of the state-action pairs
state_size : int
The dimension of the state vector
greedy_decay_rate : float
Multiplicative factor applied to the greediness after each learning step
target_update_rate : int
Number of learning steps between two updates of the target network
greediness : float
The current greediness of the agent, i.e. the probability of taking a random action. It starts at `initial_greediness` and is decayed after each learning step.
mode : str
The mode of the agent. It can be either "train" or "eval".
lr : float
The learning rate used to update the weights of the neural network.
gamma : float
The gamma parameter used in the Q-Learning algorithm.
quadratic_penalty_coefficient : float
The quadratic penalty coefficient used to penalize the agent for selling big quantities of stocks.
optimizer : torch.optim.Optimizer
The RMSprop optimizer used to update the weights of the main network. Only created when the mode is "train".
loss_fn : torch.nn.MSELoss
The loss function used to compute the loss between the predicted Q-values and the target Q-values. Only created when the mode is "train".
learning_step : int
The number of times the `learn` method has been called.
verbose : bool
Whether a message is printed each time the target network is updated.
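Examples
--------
A minimal usage sketch; `state` and `experience_batch` stand in for objects
produced by the surrounding environment and replay buffer:
>>> agent = DDQL(initial_budget=100, horizon=100, lr=1e-3)
>>> agent.train()
>>> action = agent.get_action(state)
>>> agent.learn(experience_batch)
>>> agent.eval()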
"""
def __init__(
self,
state_dict: Optional[dict] = None,
greedy_decay_rate: float = 0.95,
target_update_rate: int = 15,
initial_greediness: float = 1,
mode: str = "train",
lr: float = 1e-3,
state_size: int = 5,
initial_budget: int = 100,
horizon: int = 100,
gamma: float = 0.99,
quadratic_penalty_coefficient: float = 0.01,
verbose: bool = False
) -> None:
super().__init__(initial_budget, horizon)
self.device = get_device()
print(f"Using {self.device} device")
self.main_net = QNet(state_size=state_size, action_size=initial_budget).to(
self.device
)
self.target_net = QNet(state_size=state_size, action_size=initial_budget).to(
self.device
)
self.state_size = state_size
self.gamma = gamma
if state_dict is not None:
self.main_net.load_state_dict(state_dict)
self.target_net.load_state_dict(state_dict)
self.greedy_decay_rate = greedy_decay_rate
self.target_update_rate = target_update_rate
self.greediness = initial_greediness
self.quadratic_penalty_coefficient = quadratic_penalty_coefficient
self.mode = mode
self.learning_step = 0
if self.mode == "train":
self.optimizer = optim.RMSprop(self.main_net.parameters(), lr=lr)
self.loss_fn = nn.MSELoss()
self.verbose = verbose
def train(self) -> None:
"""This function sets the mode to "train" and trains the main neural network."""
self.main_net.train()
self.mode = "train"
def eval(self) -> None:
"""This function sets the mode to "eval" and puts the main network in evaluation mode."""
self.main_net.eval()
self.mode = "eval"
def get_action(self, state: State) -> int:
"""This function returns a tensor that is either a random binomial distribution or the index of the
maximum value in the output of a neural network, depending on certain conditions.
Parameters
----------
state : State
The `state` parameter is an instance of the `State` class, which contains information about the
current state of the environment in which the agent is operating. This information typically
includes things like the agent's current position, the state of the game board, and any other
relevant information that the agent needs
Returns
-------
an integer that represents the action to be taken based on the given state. If the `greediness`
parameter is set and the `mode` is "train", a random binomial distribution is generated using the
state's inventory as the number of trials and the probability of success as 1/inventory. Otherwise,
the action is determined by the main neural network's output, which is the index of the maximum
value in the output Q-values tensor.
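Notes
-----
Because the exploratory action is drawn from a Binomial(q, 1/q) distribution, where
q is the current inventory, the expected size of a random trade is one unit per step,
regardless of the remaining inventory.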
"""
return (
np.random.binomial(state["inventory"], 1 / state["inventory"])
if np.random.rand() < self.greediness and self.mode == "train"
else self.main_net(state).argmax().item()
)
def __update_target_net(self) -> None:
"""This function updates the target network by loading the state dictionary of the main network."""
self.target_net.load_state_dict(self.main_net.state_dict())
def __complete_target(
self, experience_batch: np.ndarray
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""This function takes in a batch of experiences and returns the corresponding targets, actions, and
states for training a reinforcement learning agent.
Parameters
----------
experience_batch : np.ndarray
`experience_batch` is a numpy array containing a batch of experiences. Each experience is a
dictionary containing information about a single step taken by the agent in the environment. The
dictionary contains keys such as "state", "action", "reward", "next_state", and "dist2Horizon".
Returns
-------
a tuple of three torch Tensors: targets, actions, and states.
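Notes
-----
Writing r for the reward, q' and p' for the next state's inventory and price, p for the
current price, gamma for the discount factor and phi for the quadratic penalty
coefficient, the target computed here is:
- r, when the experience's "dist2Horizon" is 1;
- r + gamma * q' * (p' - p) - phi * q' ** 2, when "dist2Horizon" is 0;
- r + gamma * Q_target(s', argmax_a Q_main(s', a)) otherwise, i.e. the double
Q-learning target, where the action is selected by the main network and evaluated
by the target network.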
"""
targets, actions, states = (
torch.empty(len(experience_batch)),
torch.empty(len(experience_batch)),
torch.empty((len(experience_batch), self.state_size)),
)
for i, experience in enumerate(experience_batch): # can be vectorized
actions[i] = experience["action"]
states[i] = experience["state"].astensor
if experience["dist2Horizon"] == 1:
targets[i] = experience["reward"]
elif experience["dist2Horizon"] == 0:
targets[i] = (
experience["reward"]
+ self.gamma
* experience["next_state"]["inventory"](
experience["next_state"]["Price"] - experience["state"]["Price"]
)
- self.quadratic_penalty_coefficient
* (experience["next_state"]["inventory"]) ** 2
)
else:
best_action = self.main_net(experience["next_state"]).argmax().item()
targets[i] = (
experience["reward"]
+ self.gamma
* self.target_net(experience["next_state"])[int(best_action)]
)
return targets, actions, states
def learn(self, experience_batch: np.ndarray) -> None:
"""This function trains a neural network using a batch of experiences and updates the target network
periodically.
Parameters
----------
experience_batch : np.ndarray
The experience_batch parameter is a numpy array containing a batch of experiences, where each
experience is a dictionary with the keys "state", "action", "reward", "next_state" and
"dist2Horizon". This batch is used to update the main network's weights through backpropagation.
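Examples
--------
Illustrative only; `replay_buffer` is a hypothetical stand-in for whatever object
collects experiences with the keys listed above:
>>> batch = replay_buffer.sample(256)
>>> agent.learn(batch)
>>> agent.greediness  # decayed by `greedy_decay_rate` after each call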
"""
targets, actions, states = self.__complete_target(experience_batch)
dataloader = DataLoader(
TensorDataset(states, actions, targets),
batch_size=32,
shuffle=True,
)
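# one gradient step per mini-batch: the Q-value predicted for the action actually
# taken is regressed towards the precomputed target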
for batch in dataloader:
target = batch[2]
pred = self.main_net(batch[0])[torch.arange(len(batch[0])), batch[1].long()]
loss = self.loss_fn(pred, target)
self.optimizer.zero_grad()
loss.backward(retain_graph=True)
self.optimizer.step()
self.learning_step += 1
self.greediness = max(0.01, self.greediness * self.greedy_decay_rate)
# synchronise the target network periodically, regardless of verbosity
if self.learning_step % self.target_update_rate == 0:
self.__update_target_net()
if self.verbose:
print(
f"Target network updated at step {self.learning_step} with greediness {self.greediness:.2f}"
)