#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

This contains utilities for forecasts.

"""
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import linear_model
import sklearn.base as skl_base
import sklearn.ensemble as skl_ens
import sklearn.neural_network as skl_nn
import sklearn.tree as skl_tree
import sklearn.svm as skl_svm
import sklearn.linear_model as skl_lin
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
import sklearn.cross_decomposition as cd
from sklearn.metrics import r2_score
from sklearn.linear_model import ElasticNet
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from collections import OrderedDict
import json
import configparser
import src.generalutils as gutil
import src.dmtest as dmtest

# Module-level settings, read once at import time from "config.ini"
# (a relative path, so it is resolved against the current working directory).
config = configparser.ConfigParser()
# Use str as the option-name transform so option names keep their case
# (configparser lower-cases them by default).
config.optionxform = str
config.read("config.ini")


# =============================================================================
# Misc functions
# =============================================================================
def write_roman(num):
    """Convert an integer to its Roman-numeral representation.

    Args:
        num (int): value to convert.

    Returns:
        str: Roman numeral for ``num``. Zero and negative values have no
            Roman representation and give the empty string.
    """
    # Largest value first; the subtractive pairs (CM, CD, XC, ...) are listed
    # explicitly so a single greedy pass is sufficient.
    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )
    pieces = []
    remaining = num
    for value, symbol in numerals:
        if remaining <= 0:
            break
        count, remaining = divmod(remaining, value)
        pieces.append(symbol * count)
    return "".join(pieces)


def listCombVars(X, Z, combine):
    """
    Optionally append the control names Z to the variable names X.

    When ``combine`` is truthy, returns a new list holding the string form
    of every entry of X followed by the string form of every entry of Z.
    Otherwise X itself is handed back untouched (no copy, no conversion).
    """
    if not combine:
        return X
    names = list(map(str, X))
    controls = list(map(str, Z))
    return names + controls


def RunReg(xf, Xpassvars, Yvar, norm=False, catControls=None):
    """Fit a single OLS model through the statsmodels formula API.

    Args:
        xf (pandas dataframe): data holding every referenced column
        Xpassvars (list of str): regressor column names
        Yvar (list of str): one-element list with the dependent column name
        norm (bool): if True, each regressor is standardised (mean removed,
            divided by its standard deviation) before fitting
        catControls (list of str, optional): columns entered as categorical
            ``C(...)`` terms; when any are used the formula also gets ``-1``
            (no intercept). Defaults to no categorical controls.

    Returns:
        tuple: (fitted statsmodels results, in-sample RMSE of the fitted
            values against the dependent variable)
    """
    catControls = [] if catControls is None else list(catControls)
    # Work on a sorted private list: the caller's sequence is not reordered,
    # and array-like input becomes a plain list so "+" concatenates below.
    Xpassvars = sorted(Xpassvars)
    xf = prepForModelling(xf, Xpassvars + [Yvar[0]] + catControls)
    if norm:
        for col in gutil.singleListCheck(Xpassvars):
            xf[col] = (xf[col] - xf[col].mean()).divide(xf[col].std())
    trainY = xf.loc[:, Yvar]
    # Repeated regressor names (this occurs when the baseline is included)
    # switch the categorical controls off before the names are de-duplicated.
    if len(Xpassvars) != len(set(Xpassvars)):
        print("Detected only one set of regressors per index val;")
        print("removing categorical variables")
        catControls = []
    # sorted() keeps the order of the formula terms reproducible between runs
    Xpassvars = sorted(set(Xpassvars))
    # Q("...") quotes column names so that names which are not valid Python
    # identifiers can still appear in the formula
    sY = 'Q("' + Yvar[0] + '")' + " ~ "
    sXs = "+".join(['Q("' + x + '")' for x in Xpassvars])
    if catControls:
        sCXs = "+" + "+".join(["C(" + cat + ")" for cat in catControls]) + "-1"
    else:
        sCXs = ""
    regMod = smf.ols(formula=sY + sXs + sCXs, data=xf)
    regResTrain = regMod.fit()
    predictY = regResTrain.predict(exog=xf[Xpassvars + catControls])
    # Compare against the whole target vector. Indexing with [0] would pick
    # only the first observation and broadcast it against every prediction.
    rmse_vec = sm.tools.eval_measures.rmse(predictY.values, trainY.values.ravel())
    return regResTrain, rmse_vec


def MasterRunRegression(
    df,
    XX,
    y,
    Z=None,
    catControls=None,
    horizon=0,
    save=False,
    name="unnamed",
    norm=False,
    baseline=False,
):
    """Performs multiple regressions of single y on
    all entries in each of XX, and also a set of controls included in
    each model.
    Args:
        df (pandas dataframe):  must contain Xindvars and Yvar as columns
        XX (list of list of str): columns to treat as exog (inner layers in
                                  same reg model) but will accept str or list
        y (str): column to treat as dependent (endog); could be at horizon
        Z (list): columns to treat as controls; None or [""] means none
        catControls (list): columns to treat as categorical controls;
                None, or a list holding only empty strings, means none
        horizon: FOR NAMING PURPOSES ONLY.
                Input data should be at correct horizon
        save (bool): if True, write the table as LaTeX to
                <config data/output>/<name>Reg.txt
        name (str): prefix of the saved file name
        norm (bool): passed to RunReg (standardise regressors)
        baseline (bool): if True, Z is added as an extra model
    Returns:
        statsmodels.iolib.summary2.Summary (statsmodels):
            contains regression results. np.nan is returned (after printing
            a message) if y is not a string.
    Raises:
        ValueError: if XX contains no models
    """
    # Validate y first: it is concatenated into the title just below, which
    # would otherwise raise before this check could ever run.
    if not isinstance(y, str):
        print("Error: please enter string for dependent variable")
        return np.nan
    # None stands in for the historical "no controls" placeholders, so no
    # mutable list is shared between calls.
    if Z is None:
        Z = [""]
    # Empty names are placeholders, not columns; drop them so they are never
    # looked up in df or written into the formula.
    if catControls is None:
        catControls = []
    else:
        catControls = [c for c in catControls if c != ""]
    titleString = "OLS regressions; dependent variable " + y
    if horizon != 0:
        titleString = titleString + " " + str(horizon) + " steps ahead"
    # If XX is not a list of lists, make it one
    XX = gutil.singleListCheck(XX)
    XX = gutil.doubleListCheck(XX)
    # If Z is not a list, make it one
    Z = gutil.singleListCheck(Z)
    if baseline:
        XX = XX + [Z]
    # NOTE(review): ndarray.sort() works in place along the last axis, so for
    # a rectangular XX this alphabetises the names inside each model; what
    # happens for models of different lengths depends on the numpy version
    # - confirm.
    XX = np.array(XX)
    XX.sort()
    numModels = XX.shape[0]
    if numModels == 0:
        raise ValueError("No regressors supplied: XX is empty")
    # Z == [""] is the placeholder for "no controls"
    addControls = len(Z[0]) != 0
    if numModels == 1:
        # Single model: report the full statsmodels summary
        Xpassvars = list(XX[0])
        if addControls:
            Xpassvars = Xpassvars + Z
        regResTrain, _ = RunReg(df, Xpassvars, [y], norm, catControls)
        finalTable = regResTrain.summary2()
        finalTable.add_title(titleString)
    else:
        # Several models: one column per model in a combined table
        infoDc = {
            "R-squared": lambda x: "{:.2f}".format(x.rsquared),
            "Adj. R-squared": lambda x: "{:.2f}".format(x.rsquared_adj),
            "No. observations": lambda x: "{0:d}".format(int(x.nobs)),
            "F-statistic": lambda x: "{:.2f}".format(x.fvalue),
        }
        # list(X) turns a numpy row into a plain list so that downstream
        # list concatenation behaves as intended when no controls are added.
        regsVec = [
            RunReg(df, listCombVars(list(X), Z, addControls), [y], norm, catControls)[0]
            for X in XX
        ]
        model_names_strList = [write_roman(i) for i in range(1, numModels + 1)]
        uniqueVars = [str(item) for sublist in XX for item in sublist]
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the row order of the table is reproducible between runs.
        regressorOrder = list(dict.fromkeys(uniqueVars + Z))
        finalTable = summary_col(
            results=regsVec,
            float_format="%0.2f",
            stars=True,
            model_names=model_names_strList,
            info_dict=infoDc,
            regressor_order=regressorOrder,
        )
        finalTable.add_title(titleString)
    print(finalTable)
    if save:
        outPath = os.path.join(config["data"]["output"], name + "Reg.txt")
        # Context manager closes the file even if as_latex()/write() raises
        with open(outPath, "w") as f:
            f.write(finalTable.as_latex())
    return finalTable


def collateRunResults(
    allResOneRun,
    ts_yvar,
    ts_index,
    yvar,
    metric,
    horizon,
    paper,
    trafo,
    alpha,
    stepSize,
    expanding,
    CV,
    model,
    specification,
    run_type,
):
    """
    Package the raw output of one forecast run into a labelled dataframe.

    The tidy results in allResOneRun are reduced to one in-sample and one
    out-of-sample prediction per date (see script_ISOOS_summary), every run
    setting is attached as a constant column, the index is shifted by
    `horizon` periods at the frequency inferred from ts_index, and the
    series ts_yvar is joined on as the column "target_value".

    Benchmarks implicitly use the metric for their index, and thus which
    values are used, so both sets of results are labelled by the metric.

    Rows without an in-sample prediction are dropped before returning.
    """
    categorical_columns = [
        "metric",
        "target",
        "horizon",
        "model",
        "trafo",
        "CV",
        "paper",
        "specification",
        "alpha",
        "expanding",
    ]
    # Order matters: values are paired positionally with the setting names
    # listed in the config file.
    setting_values = (
        yvar,
        metric,
        horizon,
        paper,
        trafo,
        alpha,
        stepSize,
        expanding,
        CV,
        model,
        specification,
        run_type,
    )
    collated = script_ISOOS_summary(allResOneRun)
    setting_names = json.loads(config.get("runSettings", "runSettings"))
    for column, value in zip(setting_names, setting_values):
        collated[column] = value
    # NOTE(review): assumes the configured setting names include every name
    # in categorical_columns - confirm against config.ini
    collated.loc[:, categorical_columns] = collated.loc[
        :, categorical_columns
    ].astype("category")
    # Move the index by `horizon` periods (frequency taken from ts_index) so
    # the predictions line up with the series merged in below
    collated = collated.shift(horizon, pd.infer_freq(ts_index))
    # Bring the original values of yvar back in under a fixed column name
    collated = pd.concat([collated, ts_yvar], axis=1).rename(
        columns={yvar: "target_value"}
    )
    # Every useful datetime should carry an in-sample prediction, so use
    # that column to discard the rest
    return collated.dropna(subset=["IS_prediction"])


def script_ISOOS_summary(allResultsDf):
    """
    Collapse tidy forecast results to one row per date.

    Expects columns "IS" (True for in-sample rows), "target" (the
    prediction) and "mu". For each date the first in-sample row and the
    last out-of-sample row are kept; the two sets are outer-merged on the
    index, which is named "date" in the output.

    Returns a dataframe indexed by "date" with columns "IS_prediction" and
    "OOS_prediction".
    """

    def _one_prediction_per_date(is_flag, label, keep):
        # Select one sample type, relabel the prediction column and keep a
        # single row per index value
        part = allResultsDf.loc[allResultsDf["IS"] == is_flag, ["target", "mu"]]
        part = part.rename(columns={"target": label})
        part = part[~part.index.duplicated(keep=keep)]
        # The index name is set explicitly so that reset_index() always
        # yields a "date" column to merge on
        part.index.name = "date"
        return part.reset_index()

    in_sample = _one_prediction_per_date(True, "IS_prediction", "first")
    out_of_sample = _one_prediction_per_date(False, "OOS_prediction", "last")
    merged = pd.merge(
        in_sample,
        out_of_sample,
        how="outer",
        on=["date"],
        suffixes=("_IS", "_OOS"),
    )
    merged = merged.set_index("date")
    return merged.drop(["mu_IS", "mu_OOS"], axis=1)


def prepForModelling(df, variables):
    """
    Return the requested columns of df, cleaned for model fitting.

    Repeated column names are kept once, +/- infinity is turned into NaN
    and every row containing a NaN is dropped - model runs break if they
    get passed infs or nans.
    NB: missing values are deliberately not filled by interpolation, in
    order to avoid data leakage.
    """
    wanted = gutil.singleListCheck(variables)
    selected = df.loc[:, wanted]
    first_occurrence = ~selected.columns.duplicated()
    deduplicated = selected.loc[:, first_occurrence]
    finite_only = deduplicated.replace([np.inf, -np.inf], np.nan)
    return finite_only.dropna(axis=0, how="any")


def modelSpecWrap(
    df,
    metricList,
    controls,
    ts_yvar,
    yvar,
    metric,
    horizon,
    paper,
    trafo,
    alpha,
    stepSize,
    expanding,
    CV,
    model,
    specification,
):
    """
    Wrapper that performs a model run with the target, and another
    with the benchmark.

    Model specification uses input df['target'] var as endog
    and uses metricList + controls as exog

    Benchmark just uses controls as exog

    Args:
        df (pandas dataframe): must contain 'target', yvar and every
            column in metricList and controls. It is not modified.
        metricList (list of str): text-metric columns; transformed as if in
            real time according to trafo before modelling
        controls (list of str): control columns (the benchmark regressors)
        ts_yvar: original values of yvar, merged back into the results
        yvar (str): name of the original dependent column; dropped from
            the modelling data so it cannot leak into the forecasts
        metric, horizon, paper, trafo, alpha, stepSize, expanding, CV,
        model, specification: run settings; alpha, stepSize, expanding, CV
            and model are passed to forecastTool, and all of them label the
            output rows

    Returns:
        pandas dataframe: collated results of the metric run stacked on
            top of those of the benchmark run
    """

    def _run_settings(run_type):
        # Positional bundle in the order collateRunResults expects
        return (
            yvar,
            metric,
            horizon,
            paper,
            trafo,
            alpha,
            stepSize,
            expanding,
            CV,
            model,
            specification,
            run_type,
        )

    allExog = metricList + controls
    target = "target"
    # Copy first: the column assignment below would otherwise overwrite the
    # caller's frame, so calling this repeatedly on the same df would
    # transform data that had already been transformed.
    df = df.copy()
    # As if real-time transformation of text metrics:
    df[metricList] = txtMetricTransformer(df[metricList], alpha, stepSize, trafo)
    # Drop yvar to ensure no contamination
    df = df.drop(yvar, axis=1)
    # Get the datetime index that will be used throughout
    df = prepForModelling(df, allExog + [target])
    inputDataIndex = df.index
    # Do not pass yvar through, only target: this is to protect from yvar
    # accidentally being used in forecasts
    allResTxtmet = forecastTool(
        df.loc[inputDataIndex, [target] + allExog],
        target,
        allExog,
        alpha,
        stepSize,
        expanding,
        CV,
        model,
    )
    summarisedTxtMet = collateRunResults(
        allResTxtmet, ts_yvar, inputDataIndex, *_run_settings("metric")
    )
    # Now run the benchmark on the exact same settings - that includes the
    # index from the first run to make it a fair test
    allResBchmrk = forecastTool(
        df.loc[inputDataIndex, [target] + controls],
        target,
        controls,
        alpha,
        stepSize,
        expanding,
        CV,
        model,
    )
    summarisedBchmrk = collateRunResults(
        allResBchmrk, ts_yvar, inputDataIndex, *_run_settings("benchmark")
    )
    # Merge results from both phases
    return pd.concat([summarisedTxtMet, summarisedBchmrk], axis=0)


def forecastTool(xf, Yvar, Xvars, alpha, stepSize, expanding, CV, model_name):
    """
    This uses Xvars to predict Yvar both in and out of sample
    using slices that move forward in time.

    Note that the output is in tidy format: to split by
    IS or OOS use the IS column

    Assumes that any text based variables are already transformed
    to 'as-if' real-time (if transformed at all)

    Note that in the returned dataframe the column named Yvar holds the
    predicted value, not the observed one

    Args:
        xf (pandas dataframe): contains Yvar and Xvars; must not be empty
        Yvar (str): column to predict
        Xvars (list of str): feature columns
        alpha: length of the initial training period minus stepSize
            (converted to int)
        stepSize: number of rows the window advances per step
            (converted to int)
        expanding (bool): expanding training window if True, else rolling
        CV (bool): if True (and model_name is not "OLS"), hyper-parameters
            are chosen by grid search over retParams(model_name) using
            time-series splits
        model_name (str): model key understood by model_selection

    Returns:
        pandas dataframe: stacked in-sample and out-of-sample predictions
            with columns Yvar, "IS" and "mu" (the step number)

    Raises:
        ValueError: if xf is too short for even one train/test step
    """
    lenDf = len(xf)
    assert not xf.empty
    # Builtin int: the np.int alias was removed from NumPy
    alpha = int(alpha)
    stepSize = int(stepSize)
    numSteps = int(np.floor((lenDf - stepSize - alpha) / stepSize))
    if numSteps < 1:
        raise ValueError(
            "Not enough data for a single forecast step: "
            "{0} rows with alpha={1} and stepSize={2}".format(lenDf, alpha, stepSize)
        )
    if expanding:
        print("Using expanding window")
        slice_train, slice_test = slice_insample_e, slice_oosample_e
    else:
        slice_train, slice_test = slice_insample_r, slice_oosample_r
    # Settings complete: now loop over the forecasts. Frames are collected
    # and concatenated once at the end rather than re-copied on every step.
    resultFrames = []
    for mu in range(1, numSteps + 1):
        # In-sample
        xfIS = slice_train(xf, mu, stepSize, alpha)
        assert not xfIS.empty
        x_feat = xfIS[Xvars]
        if CV and model_name != "OLS":
            # The former iid=True argument is left out: scikit-learn 0.24
            # removed it, and TimeSeriesSplit yields equally sized test
            # folds, so weighted and unweighted mean scores coincide.
            regr = GridSearchCV(
                model_selection(model_name),
                retParams(model_name),
                scoring="neg_mean_squared_error",
                cv=TimeSeriesSplit(n_splits=5),
            )
        else:
            regr = model_selection(model_name)
        regr.fit(x_feat, xfIS[Yvar].values.ravel())
        # Overwrite the observed values with the fitted ones
        xfIS[Yvar] = regr.predict(x_feat)
        xfIS["IS"] = True
        xfIS["mu"] = mu
        # Onto out of sample
        xfOOS = slice_test(xf, mu, stepSize, alpha)
        assert not xfOOS.empty
        x_test = xfOOS[Xvars].copy()
        xfOOS["IS"] = False
        xfOOS["mu"] = mu
        xfOOS[Yvar] = regr.predict(x_test)
        # Drop Xvars here to avoid lugging around lots of copies
        resultFrames.append(xfIS.drop(Xvars, axis=1))
        resultFrames.append(xfOOS.drop(Xvars, axis=1))
    allResultsDf = pd.concat(resultFrames, sort=True, axis=0)
    allResultsDf.loc[:, "mu"] = allResultsDf.loc[:, "mu"].astype("category")
    return allResultsDf


def rmse(seriesOne, seriesTwo):
    """Root-mean-square of the element-wise difference of the two inputs."""
    errors = seriesOne - seriesTwo
    return np.sqrt(np.mean(np.square(errors)))


def runDMTest(
    target,
    metric,
    horizon,
    paper,
    trafo,
    alpha,
    stepSize,
    expanding,
    CV,
    model,
    specification,
    run_type,
    save,
    sentiment,
):
    """
    Nested Diebold-Mariano test for significance of difference of two
    forecasts. Only appropriate if expanding=False: for expanding windows
    (nan, nan) is returned without reading anything.

    The stored results for `specification` are loaded from
    <config data/results>/ALL_<specification>.pkl and narrowed to the rows
    matching every run setting except run_type (both run types are needed
    for the comparison). `save` and `sentiment` are accepted but not used.

    Returns (Test stat, p-value)
    """
    if expanding:
        return (np.nan, np.nan)
    setting_names = json.loads(config["runSettings"]["runSettings"])
    # Same order as the setting names in the config file
    setting_values = (
        target,
        metric,
        horizon,
        paper,
        trafo,
        alpha,
        stepSize,
        expanding,
        CV,
        model,
        specification,
        run_type,
    )
    results_path = os.path.join(
        config["data"]["results"], "ALL_" + specification + ".pkl"
    )
    df = pd.read_pickle(results_path)
    for column, wanted in zip(setting_names, setting_values):
        if column == "run_type":
            # Keep both the metric and the benchmark rows
            continue
        df = df.loc[df[column] == wanted]
    # Pull out test statistics
    ycol, txtpred, bchpred = retrieveForecastData(df)
    dmans = dmtest.dm_test(
        ycol.values, txtpred.values, bchpred.values, h=horizon, crit="MSE"
    )
    return dmans[0], dmans[1]


def rmseDifference(ycol, txtpred, bchpred):
    """
    Benchmark RMSE minus prediction RMSE, both measured against ycol.
    A positive value means the new forecast beats the benchmark.
    """
    benchmark_error = rmse(ycol, bchpred)
    forecast_error = rmse(ycol, txtpred)
    return benchmark_error - forecast_error


def rmseRatio(ycol, txtpred, bchpred):
    """
    Prediction RMSE divided by benchmark RMSE, both measured against ycol.
    A value below 1 means the new forecast beats the benchmark.
    """
    forecast_error = rmse(ycol, txtpred)
    benchmark_error = rmse(ycol, bchpred)
    return forecast_error / benchmark_error


def retrieveForecastData(df):
    """
    Split collated results into observed values and the two forecasts.

    Returns (ycol, txtpred, bchpred): the "target_value" column (first row
    per index value, then restricted to the benchmark's index), and the
    non-missing "OOS_prediction" values of the rows whose run_type is
    "metric" and "benchmark" respectively.
    """

    def _oos_predictions(kind):
        # Out-of-sample predictions of one run type, missing values removed
        return df.loc[df["run_type"] == kind, "OOS_prediction"].dropna()

    first_per_index = ~df.index.duplicated(keep="first")
    observed = df.loc[first_per_index, "target_value"]
    txtpred = _oos_predictions("metric")
    bchpred = _oos_predictions("benchmark")
    # Observed values are aligned to the benchmark forecast's index
    return observed.loc[bchpred.index], txtpred, bchpred


def runVSbchmarkData(df, horizon, expanding):
    """
    Compare the metric forecast with the benchmark forecast held in df.

    Args:
        df (pandas dataframe): collated results containing both run types
            (see retrieveForecastData for the columns read)
        horizon: forecast horizon passed to the Diebold-Mariano test
            (converted to int)
        expanding (bool): if True the Diebold-Mariano test is skipped and
            its two entries are NaN

    Returns:
        pandas dataframe: a single row with columns "RMSE/RMSE_bch",
            "RMSE_bch-RMSE", "DMstat" and "DMpval"
    """
    ycol, txtpred, bchpred = retrieveForecastData(df)
    rmse_rat = rmseRatio(ycol, txtpred, bchpred)
    rmse_diff = rmseDifference(ycol, txtpred, bchpred)
    if expanding:
        DMstat, DMpval = (np.nan, np.nan)
    else:
        # Builtin int: the np.int alias was removed from NumPy
        DMstat, DMpval = dmtest.dm_test(
            ycol.values, txtpred.values, bchpred.values, h=int(horizon), crit="MSE"
        )
    # Prep for storage (missing out run type)
    colNames = ["RMSE/RMSE_bch", "RMSE_bch-RMSE", "DMstat", "DMpval"]
    dataCols = [rmse_rat, rmse_diff, DMstat, DMpval]
    return pd.DataFrame(data=[dataCols], columns=colNames)


# =============================================================================
# Models
# =============================================================================
def retParams(model_name):
    """
    Return the hyper-parameter grid searched during cross-validation.

    model_name is one of "NN", "Tree", "Forest", "SVM", "OLS", "Elastic",
    "Ridge", "Lasso" or "PLS"; any other key raises KeyError. The result
    maps each parameter name to the values to try.
    """

    def _depths():
        return list(range(1, 11))

    def _penalties():
        # Zero (no penalty) followed by 1e-5 ... 1
        return [0] + list(10.0 ** np.arange(-5, 1))

    widths = range(2, 11, 2)
    single_layer = [(n,) for n in widths]
    double_layer = [(n, n) for n in widths]
    svm_costs = (
        list(range(1, 6, 4)) + list(range(10, 60, 40)) + list(range(100, 1000, 400))
    )

    grids = {}
    grids["NN"] = {
        "hidden_layer_sizes": single_layer + double_layer,
        "alpha": 10.0 ** np.arange(-4, 1),
        "activation": ["relu", "tanh"],
        "solver": ["lbfgs"],
    }
    grids["Tree"] = {"max_depth": _depths()}
    grids["Forest"] = {"max_depth": _depths()}
    grids["SVM"] = {"C": svm_costs, "gamma": 10.0 ** np.arange(-4, 0)}
    grids["OLS"] = {"alpha": _penalties()}
    grids["Elastic"] = {
        "alpha": list(10.0 ** np.arange(-1, 1.1, 0.2)),
        "l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    }
    grids["Ridge"] = {"alpha": _penalties()}
    grids["Lasso"] = {"alpha": _penalties()}
    grids["PLS"] = {"n_components": list(range(1, 15))}
    return grids[model_name]


def model_str(method, val_dict=None):
    """Build the constructor call for a model as a string.
    Parameters
    ----------
    method : str
        model type, options: 'SVM', 'NN' (neural net), 'Tree', 'Forest',
        'OLS', 'Elastic', 'Ridge', 'Lasso', 'ARIMA' and 'PLS'.
    val_dict : dict, optional (Default value = None)
        dictionary keyed by model parameter with the value to use instead
        of the built-in default. Keys that are not parameters of the chosen
        model are ignored. Values are inserted with str(), so a string
        value must carry its own quotes.
    Returns
    -------
    model_str : string including the model and parameter values.
    Raises
    ------
    ValueError : if no defaults are defined for `method`.
    """
    # (method, constructor, ((parameter, default), ...)); string defaults
    # carry their own quotes because values are pasted in with str()
    catalogue = (
        ("SVM", "skl_svm.SVR", (("C", 800), ("gamma", '"auto"'), ("epsilon", 0.0))),
        # Deep NN to go in here
        (
            "NN",
            "skl_nn.MLPRegressor",
            (
                ("hidden_layer_sizes", (2, 2)),
                ("alpha", 0.1),
                ("activation", '"tanh"'),
                ("solver", '"lbfgs"'),
                ("max_iter", 2000),
            ),
        ),
        ("Tree", "skl_tree.DecisionTreeRegressor", (("max_depth", 2),)),
        (
            "Forest",
            "skl_ens.RandomForestRegressor",
            (("n_estimators", 200), ("max_depth", 8)),
        ),
        ("OLS", "skl_lin.LinearRegression", (("fit_intercept", True),)),
        (
            "Elastic",
            "skl_lin.ElasticNet",
            (("alpha", 1.0), ("l1_ratio", 0.5), ("fit_intercept", False)),
        ),
        ("Ridge", "skl_lin.Ridge", (("alpha", 1.0), ("fit_intercept", False))),
        ("Lasso", "skl_lin.Lasso", (("alpha", 1.0), ("fit_intercept", False))),
        ("ARIMA", "ARIMA", (("order", (0, 1, 1)),)),
        ("PLS", "cd.PLSRegression", (("n_components", 5),)),
    )
    for key, constructor, parameters in catalogue:
        if method == key:
            break
    else:
        raise ValueError("Model default parameter values not given.")
    overrides = {} if val_dict is None else val_dict
    # One "name=value" entry per parameter, caller's value first
    assignments = []
    for param, fallback in parameters:
        chosen = overrides[param] if param in overrides.keys() else fallback
        assignments.append(param + "=" + str(chosen))
    return constructor + "({0})".format(",".join(assignments))


def model_selection(method, val_dict=None):
    """Create a model instance for the requested method.
    Parameters
    ----------
    method : str
        model type, options: 'NN' (neural net), 'Tree', 'Forest', 'SVM',
        'OLS', 'Elastic', 'Ridge', 'Lasso', 'ARIMA', 'PLS'
    val_dict : dict, optional (Default value = None)
        dictionary keyed by model parameter with the value to use;
        parameters not listed fall back to the defaults in model_str.
    Returns
    -------
    model : model instance produced by evaluating the string from model_str
    Raises
    ------
    ValueError : if `method` is not one of the supported options.
    """
    supported = (
        "NN",
        "Tree",
        "Forest",
        "SVM",
        "OLS",
        "Elastic",
        "Ridge",
        "Lasso",
        "ARIMA",
        "PLS",
    )
    # Reject unknown model names before anything is built
    if method not in supported:
        raise ValueError("Invalid method: '{0}' not supported.".format(method))
    constructor_call = model_str(method, val_dict=val_dict)
    # NOTE(review): eval executes the generated text, and the values in
    # val_dict are pasted into it verbatim - only pass trusted parameter
    # values. The name used for "ARIMA" must also be resolvable here.
    return eval(constructor_call)


# =============================================================================
# Transforms and slices
# =============================================================================
def txtMetricTransformer(xf, alpha, stepSize, trafo):
    """
    Transforms a series as it would have been in real time,
    assuming an expanding window (this is reasonable for transforms)
    Likely to be unstable for short alpha+stepSize combinations

    Args:
        xf (pandas series or dataframe): data to transform
        alpha: length of the initial window minus stepSize
            (converted to int)
        stepSize: rows added to the window per step (converted to int)
        trafo (str): "none" returns xf untouched, "Z" standardises,
            "minmax" rescales by the window's min and max; any other value
            falls back to the identity transform

    Returns:
        xf itself when trafo is "none"; otherwise a dataframe in which each
        row carries the value it was given the first time it entered the
        expanding window. Rows beyond the last full step are absent, and an
        empty dataframe is returned if not even one step fits.
    """
    if trafo == "none":
        return xf
    # Builtin int: the np.int alias was removed from NumPy
    alpha = int(alpha)
    stepSize = int(stepSize)
    numSteps = int(np.floor((len(xf) - alpha) / stepSize))
    transforms = {"Z": transform_norm, "minmax": transform_minmax}
    transform = transforms.get(trafo, trafo_none)
    # Ensure this is not just a series (so it can be indexed by col)
    xf = pd.DataFrame(xf)
    # Collect every window and concatenate once, instead of re-copying the
    # accumulated frame on every step. No bookkeeping column is added to the
    # window: it never reached the output of the real transforms and leaked
    # an extra column into the identity fallback.
    windows = []
    for mu in range(1, numSteps + 1):
        xfIS = slice_insample_e(xf, mu, stepSize, alpha)
        windows.append(transform(xfIS, xfIS, mu, stepSize, alpha))
    if not windows:
        return pd.DataFrame()
    script_I = pd.concat(windows, axis=0)
    # Windows overlap; keeping the first occurrence retains the value each
    # row had when it first became available
    return script_I[~script_I.index.duplicated(keep="first")]


def slice_insample_e(X_feat, mu_period, step_size, alpha):
    """Returns a copy of the rows forming the mu th training period
    e = expanding horizon: the window always starts at the first row and
    ends just before row mu_period * step_size + alpha
    args:
        X_feat    : dataframe of index by features
        mu_period : period of train and test (must be positive)
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    """
    assert mu_period > 0
    window_end = mu_period * step_size + alpha
    training_rows = X_feat.iloc[:window_end, :]
    return training_rows.copy()


def slice_oosample_e(X_feat, mu_period, step_size, alpha):
    """Returns a copy of the rows forming the mu th test period
    e = expanding horizon: every row from position
    mu_period * step_size + alpha through to the end of the data
    args:
        X_feat    : dataframe of index by features
        mu_period : period of train and test
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    """
    first_test_row = mu_period * step_size + alpha
    test_rows = X_feat.iloc[first_test_row:, :]
    return test_rows.copy()


def slice_insample_r(X_feat, mu_period, step_size, alpha):
    """Returns a copy of the rows forming the mu th training period
    r = rolling horizon: a window of step_size + alpha rows whose start
    advances by step_size with every period
    args:
        X_feat    : dataframe of index by features
        mu_period : period of train and test (must be positive)
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    Note that there isn't a minus one below as in the equations as
    written, because the end of a positional slice is exclusive: the
    window covers rows (mu-1)*s up to mu*s + alpha - 1 inclusive.
    """
    assert mu_period > 0
    window_start = (mu_period - 1) * step_size
    window_end = mu_period * step_size + alpha
    return X_feat.iloc[window_start:window_end, :].copy()


def slice_oosample_r(X_feat, mu_period, step_size, alpha):
    """Returns a copy of the rows forming the mu th test period
    r = rolling horizon: every row from position
    mu_period * step_size + alpha (the first row after the rolling
    training window) through to the end of the data
    args:
        X_feat    : dataframe of index by features
        mu_period : period of train and test
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    """
    train_end = mu_period * step_size + alpha
    remainder = X_feat.iloc[train_end:, :]
    return remainder.copy()


def trafo_none(Z, X_feat, mu_period, horizon_val, alpha):
    """Identity transform: returns Z itself, unchanged and uncopied.

    The other arguments are ignored. The signature takes the same number
    of arguments as transform_norm and transform_minmax, presumably so the
    three can be called interchangeably.
    """
    return Z


def transform_norm(Z, X_feat, mu_period, step_size, alpha):
    """Standardises Z with statistics taken from the mu th expanding
    training window of X_feat
    args:
        Z         : dataframe (indexed with time, columns are features)
        X_feat    : dataframe (time by features), same number of columns
        mu_period : training period to take the statistics from (positive)
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    outputs:
        (Z - mean) / std, column by column, with mean and std computed on
        the in-sample slice of X_feat only
    """
    # Z and X_feat must have the same number of features
    assert np.shape(Z)[1] == np.shape(X_feat)[1]
    assert mu_period > 0
    window = slice_insample_e(X_feat, mu_period, step_size, alpha)
    centre = window.mean(axis=0)
    # A constant column has zero spread; divide by one instead of by zero
    spread = window.std(axis=0).replace(to_replace=0, value=1)
    centred = Z - centre
    return centred.divide(spread)


def transform_minmax(Z, X_feat, mu_period, step_size, alpha):
    """Rescales Z by the min and max of the mu th expanding training
    window of X_feat
    args:
        Z         : dataframe (indexed with time, columns are features)
        X_feat    : dataframe (time by features), same number of columns
        mu_period : training period to take the extremes from (positive)
        step_size : step size (int)
        alpha     : length of initial training period minus stepsize
    outputs:
        (Z - min) / (max - min), column by column, with min and max
        computed on the in-sample slice of X_feat only
    """
    # Z and X_feat must have the same number of features
    assert np.shape(Z)[1] == np.shape(X_feat)[1]
    assert mu_period > 0
    window = slice_insample_e(X_feat, mu_period, step_size, alpha)
    lowest = window.min(axis=0)
    highest = window.max(axis=0)
    # A constant column has zero range; divide by one instead of by zero
    value_range = (highest - lowest).replace(to_replace=0, value=1)
    shifted = Z - lowest
    return shifted.divide(value_range)
