# Source code for ddql_optimal_execution.preprocessing._preprocessor

from typing import Tuple

import numpy as np
import pandas as pd


def normalize(df: pd.Series) -> pd.Series:
    '''Shift a series so it starts at zero, then standardize it.

    The first value is subtracted from every element, the result is centered
    on its mean and finally divided by its (sample) standard deviation, so the
    output has a mean of 0 and a standard deviation of 1.

    The input is left untouched: a new Series is returned instead of modifying
    the caller's data in place.

    Parameters
    ----------
    df : pd.Series
        A numeric pandas Series (for example a price column). It must contain
        at least one element, otherwise ``df.iloc[0]`` raises ``IndexError``.

    Returns
    -------
    pd.Series
        A new Series with the same index as ``df`` holding the normalized
        values. A constant or single-element input has no usable standard
        deviation, so the result is all NaN in that case.

    '''
    # Use binary operators (not -= / /=) so the caller's Series is never mutated.
    shifted = df - df.iloc[0]
    centered = shifted - shifted.mean()
    return centered / centered.std()

class Preprocessor:
    '''Preprocess the data before it is fed into the environment.

    The data is split into equally sized periods, its feature columns are
    normalized and, optionally, a per-period quadratic variation ("QV")
    feature is added.

    Attributes
    ----------
    n_periods : int
        Number of periods the time series is split into. It is the number of
        trading actions agents can take.
    QV : bool
        QV stands for "Quadratic Variation" and is a measure of volatility.
        If True, the QV is calculated and added to the DataFrame.
    normalize_price : bool
        Whether the price data should be normalized to have a mean of 0 and a
        standard deviation of 1 after subtracting the first price value.
    volume : bool
        If True, a "volume" column is required and kept; if False, the
        "volume" column is dropped from the output.

    Methods
    -------
    __call__(df)
        Split a pandas DataFrame into periods, normalize it and optionally
        add the QV feature.
    '''

    def __init__(
        self,
        n_periods: int,
        QV: bool = True,
        normalize_price: bool = True,
        volume: bool = True,
    ) -> None:
        '''Store the preprocessing options.

        Parameters
        ----------
        n_periods : int
            Number of periods the time series is split into. It is the number
            of trading actions agents can take.
        QV : bool, optional
            Whether the quadratic variation feature should be calculated and
            added to the DataFrame.
        normalize_price : bool, optional
            Whether the price data should be normalized to have a mean of 0
            and a standard deviation of 1 after subtracting the first price
            value.
        volume : bool, optional
            Whether the "volume" column must be present and kept (True) or
            dropped from the output (False).
        '''
        self.n_periods = n_periods
        self.QV = QV
        self.normalize_price = normalize_price
        self.volume = volume

    def __call__(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        '''Split the data into periods and build the normalized features.

        Parameters
        ----------
        df : pd.DataFrame
            A pandas DataFrame containing financial data with a "Date" column
            and a "Price" column (plus a "volume" column when ``volume`` is
            True). Its number of rows must be a multiple of ``n_periods``.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame]
            The preprocessed DataFrame (indexed by "Date", with a "period"
            column and, if requested, a "QV" column) and a DataFrame holding
            the un-normalized "Price" together with "period". The first row
            is removed from both.

        Raises
        ------
        ValueError
            If the rows cannot be divided into ``n_periods`` equal periods, or
            if ``volume`` is True and there is no "volume" column.
        '''
        # set_index returns a new frame, so the caller's DataFrame is untouched.
        df = df.set_index("Date")

        n_rows = len(df)
        if n_rows % self.n_periods:
            raise ValueError(
                f"Cannot split {n_rows} rows into {self.n_periods} equal periods."
            )
        # Label rows by position rather than by index label, so repeated
        # "Date" values cannot leak a row into a neighbouring period.
        df["period"] = np.repeat(
            np.arange(self.n_periods, dtype=int), n_rows // self.n_periods
        )

        # Snapshot the prices before any normalization touches them.
        raw_prices = df[["Price", "period"]].iloc[1:].copy()

        if self.normalize_price:
            df["Price"] = normalize(df["Price"])

        # NOTE(review): this loop normalizes every column except "period",
        # "Price" included, whatever the value of normalize_price -- confirm
        # whether the flag is meant to guard this loop as well.
        for col in df.columns:
            if col == "period":
                continue
            df[col] = df[col].astype(float)
            df[col] = normalize(df[col])

        if self.QV:
            # Per-period sum of squared consecutive price differences,
            # broadcast back to every row of that period.
            df["QV"] = df.groupby("period")["Price"].transform(
                lambda x: ((x - x.shift(1)) ** 2).sum()
            )
            df["QV"] -= df["QV"].mean()
            df["QV"] /= 2 * df["QV"].std()

        if self.volume:
            if "volume" not in df.columns:
                raise ValueError("Volume data is not present in the DataFrame.")
        else:
            # errors="ignore": nothing to drop is fine when volume is not wanted.
            df = df.drop(columns="volume", errors="ignore")

        df = df.iloc[1:]
        return df, raw_prices