% Dias and Duarte
% All mistakes are our own. Please email joao.duarte@novasbe.pt for any
% mistakes or bugs found.
% last update: 8/10/2018


clear
close all
clc

addpath('../../auxiliary files')
addpath('../Data')

% =========================================================================
% DESCRIPTION 
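% This script loads a FRED-MD dataset, transforms and standardizes each
% series, and writes the processed data and the series-name lists to CSV
% files. (Factor estimation via baing() is currently commented out.)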
%
% -------------------------------------------------------------------------
% BREAKDOWN OF THE SCRIPT
% 
% Part 1: Load and label FRED-MD data.
%
% Part 2: Process data -- transform each series to be stationary and remove
%         outliers.
%
% 
%
% -------------------------------------------------------------------------
% AUXILIARY FUNCTIONS
% List of auxiliary functions to be saved in same folder as this script.
%
%   prepare_missing() - transforms series based on given transformation
%       numbers
%
%   
%
% 
%
% 
% 
%
% -------------------------------------------------------------------------
% NOTES
% Author: Joao Duarte
% Date: 21/7/2018
% Version: MATLAB 2017a
% Required Toolboxes: None
%
% -------------------------------------------------------------------------
% PARAMETERS TO BE CHANGED

% File name of desired FRED-MD vintage
csv_in='fred_data.csv';

% Type of transformation performed on each series before the data are
% written out
%   0 --> no transformation
%   1 --> demean only
%   2 --> demean and standardize
%   3 --> recursively demean and then standardize
DEMEAN=2;

% =========================================================================
% PART 1: LOAD AND LABEL DATA

% Load data from CSV file; the numeric block is read from dum.data and the
% text block (series names, dates) from dum.textdata
dum=importdata(csv_in,',');

% Variable names (first text row, skipping the first column, which holds
% the dates)
series=dum.textdata(1,2:end);

% Fast code (first numeric row); series with fastcode==1 are treated as
% "fast" and series with fastcode==0 as "slow" below
fastcode=dum.data(1,:);

% Transformation numbers (second numeric row)
tcode=dum.data(2,:);

% Raw data (remaining numeric rows)
rawdata=dum.data(3:end,:);

% Month/year of final observation
final_datevec=datevec(dum.textdata(end,1));
final_month=final_datevec(2);
final_year=final_datevec(1);

% Dates (monthly) are of the form YEAR+MONTH/12
% e.g. March 1970 is represented as 1970+3/12
% Dates go from 1959:01 to final_year:final_month (see above)
dates = (1959+1/12:1/12:final_year+final_month/12)';

% T = number of months in sample
T=size(dates,1);
rawdata=rawdata(1:T,:);

% =========================================================================
% PART 2: PROCESS DATA

% Transform raw data to be stationary using auxiliary function
% prepare_missing()
yt=prepare_missing(rawdata,tcode);

% Reduce sample to usable dates: rows 12 to T-1 are kept, i.e. the first
% 11 months are dropped (presumably because differenced series have no
% values at the start of the sample -- confirm against prepare_missing)
% and the final month is dropped as well.
all_X = yt(12:T-1,:);
dates=dates(12:T-1,:);

% Series-name lists: all series (as a column), slow-moving series and
% fast-moving series
finalseries = series(:);
slowseries = series(:, fastcode==0);
fastseries = series(:, fastcode==1);

% =========================================================================
% PART 3: CHECKS

x = all_X;

% Locate missing values in the processed data
x1=isnan(all_X);

% Check that the data are not missing values for an entire row
if any(all(x1,2))
    error('Input x contains entire row of missing values.');
end

% Check that the data are not missing values for an entire column
if any(all(x1,1))
    error('Input x contains entire column of missing values.');
end

% Check that DEMEAN is one of 0, 1, 2, 3
if ~ismember(DEMEAN,[0 1 2 3])
    error('Input DEMEAN is specified incorrectly.');
end

% =========================================================================
% PART 4: SETUP

% Number of observations per series (i.e. number of rows)
T=size(all_X,1);

% Number of series (i.e. number of columns)
N=size(all_X,2);

% Settings for an EM iteration. They are not used anywhere in this script
% and are retained only so that the workspace is unchanged.
maxit=50;   % maximum number of iterations
err=999;    % error, initialised to an arbitrarily high number
it=0;       % iteration counter

% =========================================================================
% PART 5: FILL MISSING VALUES AND STANDARDIZE
% Fill in missing values for each series with the unconditional mean of
% that series, then demean and standardize the updated dataset.

% Unconditional mean of the non-missing values of each series. mean() with
% 'omitnan' is base MATLAB (no toolbox needed, unlike nanmean) and the
% explicit dimension keeps the result column-wise even for a single row.
mut=repmat(mean(all_X,1,'omitnan'),T,1);

% Replace missing values with unconditional mean
x2=all_X;
x2(x1)=mut(x1);
%yt(:,fcode==1)= x2;

% Demean and standardize data using subfunction transform_data()
%   x3  = transformed dataset
%   mut = matrix containing the values subtracted from x2 during the
%         transformation
%   sdt = matrix containing the values that x2 was divided by during the
%         transformation
[x3,mut,sdt]=transform_data(x2,DEMEAN);


% -------------------------------------------------------------------------
% WRITE OUTPUT FILES

% Processed data matrix, one series per column, without a header row.
% An earlier version first wrote a header line to this file, but that line
% repeated the last series name and was then overwritten by dlmwrite()
% (which replaces the file unless '-append' is given). The file has
% therefore always been data-only; the matching names are written to
% finalseries2.csv below.
% NOTE(review): dlmwrite's default precision keeps only about five
% significant digits -- pass 'precision' here if downstream code needs
% more. Left unchanged so the output file stays identical.
dlmwrite('FAVAR_data.csv', x3) ;

% Series-name lists, each written as a single comma-separated line.
% strjoin() handles one-element lists (no stray leading comma) and empty
% lists (an empty line instead of an indexing error).
name_files = {'finalseries2.csv', finalseries; ...
              'slowseries2.csv',  slowseries;  ...
              'fastseries2.csv',  fastseries};

for k = 1:size(name_files,1)
    fid = fopen(name_files{k,1}, 'w') ;
    if fid == -1
        error('favar:fileOpen', 'Could not open %s for writing.', ...
            name_files{k,1});
    end
    % reshape to a row so that column cell arrays (finalseries) are
    % accepted by strjoin
    fprintf(fid, '%s\n', strjoin(reshape(name_files{k,2},1,[]), ',')) ;
    fclose(fid) ;
end
%  
%  baing(x3(:,:), 7,3)

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [x22,mut,sdt]=transform_data(x2,DEMEAN)
% =========================================================================
% DESCRIPTION
% This function transforms a given set of series based upon the input
% variable DEMEAN. The following transformations are possible:
%
%   DEMEAN = 0) No transformation.
%   
%   DEMEAN = 1) Each series is demeaned only (i.e. each series is rescaled
%   to have a mean of 0).
%   
%   DEMEAN = 2) Each series is demeaned and standardized (i.e. each series
%   is rescaled to have a mean of 0 and a standard deviation of 1).
%   
%   DEMEAN = 3) Each series is recursively demeaned and then standardized.
%   For a given series x(t), where t=1,...,T, the recursively demeaned
%   series x'(t) is calculated as x'(t) = x(t) - mean(x(1:t)). After the
%   recursively demeaned series x'(t) is calculated, it is standardized by
%   dividing x'(t) by the standard deviation of the original series x. Note
%   that this transformation does not rescale the original series to have a
%   specified mean or standard deviation.
%
% All means and standard deviations are taken down the columns (dimension
% 1), so an input with a single row is handled column by column as well.
% A series with zero standard deviation yields NaN/Inf in modes 2 and 3,
% because the series is divided by that standard deviation.
%
% -------------------------------------------------------------------------
% INPUTS
%           x2      = set of series to be transformed (one series per
%                     column); no missing values;
%           DEMEAN  = an integer indicating the type of transformation
%                     performed on each series in x2; it can take on the
%                     following values:
%                           0 (no transformation)
%                           1 (demean only)
%                           2 (demean and standardize)
%                           3 (recursively demean and then standardize) 
%
% OUTPUTS
%           x22     = transformed dataset
%           mut     = matrix containing the values subtracted from x2
%                     during the transformation
%           sdt     = matrix containing the values that x2 was divided by
%                     during the transformation
%
% ERRORS
%           An error is raised if DEMEAN is not one of 0, 1, 2, 3.
%
% =========================================================================
% FUNCTION

% Number of observations in each series (i.e. number of rows in x2)
T=size(x2,1);

% Number of series (i.e. number of columns in x2)
N=size(x2,2);

% Perform transformation based on type determined by 'DEMEAN'
switch DEMEAN
    
    % ---------------------------------------------------------------------
    % No transformation
    case 0
        mut=zeros(T,N);
        sdt=ones(T,N);
        x22=x2;
        
    % ---------------------------------------------------------------------
    % Each series is demeaned only
    case 1
        mut=repmat(mean(x2,1),T,1);
        sdt=ones(T,N);
        x22=x2-mut;
        
    % ---------------------------------------------------------------------
    % Each series is demeaned and standardized 
    case 2
        mut=repmat(mean(x2,1),T,1);
        sdt=repmat(std(x2,0,1),T,1);
        x22=(x2-mut)./sdt;
        
    % ---------------------------------------------------------------------
    % Each series is recursively demeaned and then standardized
    case 3
        mut=NaN(size(x2));
        for t=1:T
            % Mean of each series over observations 1..t
            mut(t,:)=mean(x2(1:t,:),1);
        end
        sdt=repmat(std(x2,0,1),T,1);
        x22=(x2-mut)./sdt; 

    % ---------------------------------------------------------------------
    % Anything else is a caller error; fail with a clear message instead of
    % returning with unassigned outputs
    otherwise
        error('transform_data:invalidDEMEAN', ...
            'Input DEMEAN is specified incorrectly.');
end
end


function [ic1,chat,Fhat,eigval]=baing(X,kmax,jj)
% =========================================================================
% DESCRIPTION
% This function determines the number of factors to be selected for a given
% dataset using one of three information criteria specified by the user.
% The user also specifies the maximum number of factors to be selected.
%
% For each candidate number of factors k, the criterion is computed as
% log(Sigma(k)) + CT(k), where Sigma(k) is the sum of squared residuals
% divided by N*T and CT(k) is the overfitting penalty.
% NOTE(review): the criteria are labelled PC_p1-PC_p3 below, but the
% log(Sigma)+penalty form looks like the IC_p criteria of Bai and Ng (2002)
% -- confirm the intended naming.
%
% -------------------------------------------------------------------------
% INPUTS
%           X       = dataset (one series per column)
%           kmax    = an integer indicating the maximum number of factors
%                     to be estimated; must not exceed min(N,T)
%           jj      = an integer indicating the information criterion used 
%                     for selecting the number of factors; it can take on 
%                     the following values:
%                           1 (information criterion PC_p1)
%                           2 (information criterion PC_p2)
%                           3 (information criterion PC_p3)    
%
% OUTPUTS
%           ic1     = number of factors selected (0 if the criterion is
%                     minimized when no factors are used)
%           chat    = values of X predicted by the first kmax factors
%           Fhat    = the first kmax factors
%           eigval  = eigenvalues of X'*X (or X*X' if T<N)
%
% ERRORS
%           An error is raised if jj is not 1, 2 or 3, or if kmax exceeds
%           min(N,T) (the number of components available).
%
% -------------------------------------------------------------------------
% SUBFUNCTIONS USED
%
% minindc() - finds the index of the minimum value for each column of a
%       given matrix
%
% -------------------------------------------------------------------------
% BREAKDOWN OF THE FUNCTION
%
% Part 1: Setup.
%
% Part 2: Calculate the overfitting penalty for each possible number of
%         factors to be selected (from 1 to kmax).
%
% Part 3: Select the number of factors that minimizes the specified
%         information criterion by utilizing the overfitting penalties
%         calculated in Part 2.
%
% Part 4: Save other output variables to be returned by the function (chat,
%         Fhat, and eigval). 
%
% =========================================================================
% PART 1: SETUP

% Number of observations per series (i.e. number of rows)
T=size(X,1);

% Number of series (i.e. number of columns)
N=size(X,2);

% Total number of observations
NT=N*T;

% Number of rows + columns
NT1=N+T;

% The smaller of N and T
GCT=min([N;T]);

% Only min(N,T) components are computed below, so a larger kmax cannot be
% served; fail here with a clear message rather than an indexing error
if kmax>GCT
    error('baing:invalidKmax', ...
        'Input kmax (%d) exceeds min(N,T) = %d.', kmax, GCT);
end

% =========================================================================
% PART 2: OVERFITTING PENALTY
% Determine penalty for overfitting based on the selected information
% criterion. 

% Array containing possible number of factors that can be selected (1 to
% kmax)
ii=1:1:kmax;

% Calculate penalty based on criterion determined by jj. 
switch jj
    
    % Criterion PC_p1
    case 1
        CT=log(NT/NT1)*ii*NT1/NT;
        
    % Criterion PC_p2
    case 2
        CT=(NT1/NT)*log(GCT)*ii;
        
    % Criterion PC_p3
    case 3
        CT=ii*log(GCT)/GCT;

    % An unknown criterion would otherwise leave the penalty at zero and
    % silently turn the selection into an unpenalised fit comparison
    otherwise
        error('baing:invalidCriterion', ...
            'Input jj must be 1, 2, or 3.');
        
end

% =========================================================================
% PART 3: SELECT NUMBER OF FACTORS
% Perform principal component analysis on the dataset and select the number
% of factors that minimizes the specified information criterion.

% -------------------------------------------------------------------------
% RUN PRINCIPAL COMPONENT ANALYSIS

% Get components, loadings, and eigenvalues. The decomposition is applied
% to whichever of X*X' (T-by-T) and X'*X (N-by-N) is smaller.
if T<N 
    
    % Singular value decomposition
    [ev,eigval,~]=svd(X*X'); 
    
    % Components
    Fhat0=sqrt(T)*ev;
    
    % Loadings
    Lambda0=X'*Fhat0/T;
    
else
    
    % Singular value decomposition
    [ev,eigval,~]=svd(X'*X);
    
    % Loadings
    Lambda0=sqrt(N)*ev;
    
    % Components
    Fhat0=X*Lambda0/N;

end

% -------------------------------------------------------------------------
% SELECT NUMBER OF FACTORS 
    
% Preallocate memory; entry kmax+1 holds the "no factors" case
Sigma=zeros(1,kmax+1); % sum of squared residuals divided by NT
IC1=zeros(1,kmax+1);   % information criterion value

% Loop through all possibilites for the number of factors 
for i=kmax:-1:1

    % Identify factors as first i components
    Fhat=Fhat0(:,1:i);

    % Identify factor loadings as first i loadings
    lambda=Lambda0(:,1:i);

    % Predict X using i factors
    chat=Fhat*lambda';

    % Residuals from predicting X using the factors
    ehat=X-chat;

    % Sum of squared residuals divided by NT (sum over time, mean over
    % series; dimensions given explicitly so a single-row X is handled
    % the same way)
    Sigma(i)=mean(sum(ehat.*ehat/T,1),2);

    % Value of the information criterion when using i factors
    IC1(i)=log(Sigma(i))+CT(i);
    
end

% Sum of squared residuals when using no factors to predict X (i.e.
% fitted values are set to 0)
Sigma(kmax+1)=mean(sum(X.*X/T,1),2);

% Value of the information criterion when using no factors
IC1(kmax+1)=log(Sigma(kmax+1));

% Number of factors that minimizes the information criterion
ic1=minindc(IC1')';

% Set ic1=0 if ic1>kmax (i.e. no factors are selected if the value of the
% information criterion is minimized when no factors are used)
ic1=ic1 .*(ic1 <= kmax);

% =========================================================================
% PART 4: SAVE OTHER OUTPUT

% Factors and loadings when number of factors set to kmax
Fhat=Fhat0(:,1:kmax); % factors
Lambda=Lambda0(:,1:kmax); % factor loadings

% Predict X using kmax factors
chat=Fhat*Lambda';

% Get the eigenvalues corresponding to X'*X (or X*X' if T<N)
eigval=diag(eigval);
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


function pos=minindc(x)
% =========================================================================
% DESCRIPTION
% This function finds the index of the minimum value for each column of a
% given matrix. The function assumes that the minimum value of each column
% occurs only once within that column. The function returns an error if
% this is not the case.
%
% The minimum is located by direct comparison with min(), so infinite
% minima (e.g. -Inf) are found as well. A column in which no element equals
% its minimum (e.g. a column consisting only of NaN) yields index 0.
%
% -------------------------------------------------------------------------
% INPUT
%           x   = matrix 
%
% OUTPUT
%           pos = column vector with pos(i) containing the row number
%                 corresponding to the minimum value of x(:,i)
%
% ERRORS
%           An error is raised if the minimum value of a column occurs
%           more than once in that column.
%
% =========================================================================
% FUNCTION

% Number of columns of x
ncols=size(x,2);

% Preallocate memory for output array
pos=zeros(ncols,1);

% Find the index of the minimum value of each column in x
for i=1:ncols
    
    % Rows of column i that hold its minimum value. Comparing with ==
    % (rather than testing x - min == 0) keeps -Inf/Inf minima detectable,
    % because Inf - Inf is NaN.
    col_i=x(:,i);
    rows_i=find(col_i==min(col_i));
    
    % Produce an error if the minimum value occurs more than once
    if numel(rows_i)>1
        error('Minimum value occurs more than once.');
    end
    
    % Store the row index (left at 0 when no row matched)
    if ~isempty(rows_i)
        pos(i)=rows_i;
    end
    
end
end
