Source code for ddql_optimal_execution.agent._ddql

from ._agent import Agent
from typing import Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from ._neural_net import QNet
from ddql_optimal_execution import State, get_device


class DDQL(Agent):
    """The DDQL class inherits from the Agent class. It is an agent that implements
    the Double Deep Q-Learning algorithm.

    Parameters
    ----------
    state_dict : dict, optional
        A dictionary containing the state of the agent, by default None
    greedy_decay_rate : float, optional
        The greedy decay rate, by default 0.95
    target_update_rate : int, optional
        The target update rate, by default 15
    initial_greediness : float, optional
        The initial greediness, by default 1
    mode : str, optional
        The mode, by default "train"
    lr : float, optional
        The learning rate, by default 1e-3
    state_size : int, optional
        The state size, by default 5
    initial_budget : int, optional
        The initial budget, by default 100
    horizon : int, optional
        The horizon, by default 100
    gamma : float, optional
        The discount factor used in the Q-Learning update, by default 0.99
    quadratic_penalty_coefficient : float, optional
        The quadratic penalty coefficient used to penalize the agent for selling
        large quantities of stock, by default 0.01
    verbose : bool, optional
        Whether to print a message when the target network is updated, by default False

    Attributes
    ----------
    device : torch.device
        The device used to run the agent.
    main_net : QNet
        The main neural network used to predict the Q-values of state-action pairs.
    target_net : QNet
        The target neural network used to predict the Q-values of state-action pairs.
    state_size : int
        The state size.
    greedy_decay_rate : float
        The greedy decay rate.
    target_update_rate : int
        The target update rate, i.e. how many learning steps pass between two
        updates of the target network.
    initial_greediness : float
        The initial greediness of the agent, which determines the probability of
        the agent taking a random action.
    greediness : float
        The current greediness of the agent.
    mode : str
        The mode of the agent, either "train" or "eval".
    lr : float
        The learning rate used to update the weights of the neural network.
    gamma : float
        The discount factor used in the Q-Learning update.
    quadratic_penalty_coefficient : float
        The quadratic penalty coefficient used to penalize the agent for selling
        large quantities of stock.
    optimizer : torch.optim.Optimizer
        The optimizer used to update the weights of the main network.
    loss_fn : torch.nn.Module
        The loss function used to compute the loss between the predicted Q-values
        and the target Q-values.
    """

    def __init__(
        self,
        state_dict: Optional[dict] = None,
        greedy_decay_rate: float = 0.95,
        target_update_rate: int = 15,
        initial_greediness: float = 1,
        mode: str = "train",
        lr: float = 1e-3,
        state_size: int = 5,
        initial_budget: int = 100,
        horizon: int = 100,
        gamma: float = 0.99,
        quadratic_penalty_coefficient: float = 0.01,
        verbose: bool = False,
    ) -> None:
        super().__init__(initial_budget, horizon)

        self.device = get_device()
        print(f"Using {self.device} device")

        self.main_net = QNet(state_size=state_size, action_size=initial_budget).to(
            self.device
        )
        self.target_net = QNet(state_size=state_size, action_size=initial_budget).to(
            self.device
        )

        self.state_size = state_size
        self.gamma = gamma

        if state_dict is not None:
            self.main_net.load_state_dict(state_dict)
            self.target_net.load_state_dict(state_dict)

        self.greedy_decay_rate = greedy_decay_rate
        self.target_update_rate = target_update_rate
        self.greediness = initial_greediness
        self.quadratic_penalty_coefficient = quadratic_penalty_coefficient

        self.mode = mode
        self.learning_step = 0

        if self.mode == "train":
            self.optimizer = optim.RMSprop(self.main_net.parameters(), lr=lr)
            self.loss_fn = nn.MSELoss()

        self.verbose = verbose
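
    # Construction note (illustrative, not part of the original code): `action_size`
    # is set to `initial_budget`, so an action index is naturally read as a quantity
    # of shares to sell at a given step. A minimal sketch with the defaults used in
    # this module:
    #
    #     agent = DDQL(state_size=5, initial_budget=100, horizon=100)
    #
    # Passing `state_dict` restores pretrained weights into both networks; without
    # it, the main and target networks start from independent random initializations
    # until the first target update copies the main network's weights across.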

    def train(self) -> None:
        """Set the agent's mode to "train" and put the main network in training mode."""
        self.main_net.train()
        self.mode = "train"

    def eval(self) -> None:
        """Set the agent's mode to "eval" and put the main network in evaluation mode."""
        self.main_net.eval()
        self.mode = "eval"

    def get_action(self, state: State) -> int:
        """Return the action to take in the given state, either a random exploratory
        action or the greedy action according to the main network.

        Parameters
        ----------
        state : State
            An instance of the `State` class, which contains information about the
            current state of the environment the agent is operating in, such as the
            remaining inventory and the current price.

        Returns
        -------
        int
            The action to take in the given state. If the mode is "train" and a
            uniform random draw falls below `greediness`, a random action is sampled
            from a binomial distribution with the state's inventory as the number of
            trials and 1/inventory as the success probability. Otherwise, the action
            is the index of the maximum value in the main network's output Q-values.
        """
        return (
            np.random.binomial(state["inventory"], 1 / state["inventory"])
            if np.random.rand() < self.greediness and self.mode == "train"
            else self.main_net(state).argmax().item()
        )
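
    # Exploration note (illustrative): during training the agent explores with
    # probability `greediness`, which decays after each call to `learn`. The
    # exploratory action is drawn from Binomial(n=inventory, p=1/inventory), whose
    # mean is one share per step, so random exploration tends to spread the
    # remaining inventory over the horizon rather than dumping it at once.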

    def __update_target_net(self) -> None:
        """Update the target network by loading the state dictionary of the main network."""
        self.target_net.load_state_dict(self.main_net.state_dict())

    def __complete_target(
        self, experience_batch: np.ndarray
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Take a batch of experiences and return the corresponding targets, actions,
        and states used to train the agent.

        Parameters
        ----------
        experience_batch : np.ndarray
            A numpy array containing a batch of experiences. Each experience is a
            dictionary describing a single step taken by the agent in the
            environment, with keys such as "state", "action", "reward",
            "next_state", and "dist2Horizon".

        Returns
        -------
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
            A tuple of three torch Tensors: targets, actions, and states.
        """
        targets, actions, states = (
            torch.empty(len(experience_batch)),
            torch.empty(len(experience_batch)),
            torch.empty((len(experience_batch), self.state_size)),
        )

        for i, experience in enumerate(experience_batch):  # can be vectorized
            actions[i] = experience["action"]
            states[i] = experience["state"].astensor

            if experience["dist2Horizon"] == 1:
                targets[i] = experience["reward"]
            elif experience["dist2Horizon"] == 0:
                # At the horizon, the remaining inventory is valued at the price
                # move and penalized quadratically for being liquidated at once.
                targets[i] = (
                    experience["reward"]
                    + self.gamma
                    * experience["next_state"]["inventory"]
                    * (
                        experience["next_state"]["Price"]
                        - experience["state"]["Price"]
                    )
                    - self.quadratic_penalty_coefficient
                    * (experience["next_state"]["inventory"]) ** 2
                )
            else:
                # Double Q-learning: the main network selects the best next action,
                # the target network evaluates it.
                best_action = self.main_net(experience["next_state"]).argmax().item()
                targets[i] = (
                    experience["reward"]
                    + self.gamma
                    * self.target_net(experience["next_state"])[int(best_action)]
                )

        return targets, actions, states
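
    # Target construction used above (illustrative summary): outside the two
    # boundary cases on `dist2Horizon`, the Double Q-learning target is
    #
    #     y = r + gamma * Q_target(s', argmax_a Q_main(s', a))
    #
    # i.e. the main network selects the next action and the target network
    # evaluates it; this decoupling is what distinguishes Double DQN from
    # standard DQN.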

    def learn(self, experience_batch: np.ndarray) -> None:
        """Train the main network on a batch of experiences and periodically update
        the target network.

        Parameters
        ----------
        experience_batch : np.ndarray
            A numpy array containing a batch of experiences, where each experience
            is a dictionary with the keys "state", "action", "reward",
            "next_state", and "dist2Horizon". This batch is used to update the main
            network's weights through backpropagation.
        """
        targets, actions, states = self.__complete_target(experience_batch)

        dataloader = DataLoader(
            TensorDataset(states, actions, targets),
            batch_size=32,
            shuffle=True,
        )

        for batch in dataloader:
            target = batch[2]
            pred = self.main_net(batch[0])[
                torch.arange(len(batch[0])), batch[1].long()
            ]

            loss = self.loss_fn(pred, target)

            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer.step()

        self.learning_step += 1
        self.greediness = max(0.01, self.greediness * self.greedy_decay_rate)

        # Update the target network every `target_update_rate` learning steps;
        # the verbosity flag only controls whether a message is printed.
        if self.learning_step % self.target_update_rate == 0:
            self.__update_target_net()
            if self.verbose:
                print(
                    f"Target network updated at step {self.learning_step} "
                    f"with greediness {self.greediness:.2f}"
                )
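

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library). It assumes the
# surrounding package provides `State` instances and experience dictionaries
# with the keys used above ("state", "action", "reward", "next_state",
# "dist2Horizon"), e.g. collected by a replay buffer.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    agent = DDQL(
        state_size=5,
        initial_budget=100,
        horizon=100,
        lr=1e-3,
        gamma=0.99,
        verbose=True,
    )
    agent.train()

    # `current_state` would come from the execution environment; it must expose
    # "inventory" and "Price" items and an `.astensor` attribute as used above.
    # action = agent.get_action(current_state)

    # `experience_batch` would be a numpy array of experience dictionaries
    # gathered while interacting with the environment.
    # agent.learn(experience_batch)

    # Switch to greedy action selection once training is done.
    agent.eval()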