function [gmm gmmb se] = windmeijeriv(Y,X,Z,gmm0)
%% Calculates the GMM point estimator, its bias-corrected version and standard errors
%% for a two-way exponential regression model on a (balanced) n-by-m panel data set
%% using instrumental variables based on Windmeijer and Santos Silva (1997 JAE).
% INPUT:
% Y    = n-by-m array of outcomes;
% X    = d-by-1 cell of n-by-m matrices of regressors (the number of
%        regressors is read from the first dimension of the cell);
% Z    = q-by-1 cell of n-by-m matrices of instruments; Z must include exogenous X;
% gmm0 = starting value for the optimization (column vector with one entry per regressor).
% OUTPUT:
% gmm  = point estimate;
% gmmb = bias-corrected point estimate (gmm minus the bias returned by Bias);
% se   = standard errors: square roots of the diagonal of the asymptotic
%        covariance matrix computed in Bias (the matrix itself is not returned).

% NOTES:
% This algorithm estimates two-way FE. Partitioned-inverse formulae are used to speed up computation.
% All warnings are silenced while the routine runs; the warning state that
% was active on entry is restored when the function exits, also on error.

previousWarningState = warning('off','all');
restoreWarnings = onCleanup(@() warning(previousWarningState)); %#ok<NASGU>

% Minimization of the GMM problem by Newton's method, starting at gmm0
% (Newton maximises the negated quadratic form returned by QuadraticForm).
gmm = Newton(@QuadraticForm,gmm0,Y,X,Z);

% Bias estimate and asymptotic covariance matrix evaluated at the point estimate
[bias, asyvar] = Bias(Y,X,Z,gmm);

gmmb = gmm-bias;

se = sqrt(diag(asyvar));








function [criterion score Hessian H] = QuadraticForm(theta,Y,X,Z)
% GMM objective (in the form maximised by Newton) for the common parameter
% theta, with the fixed effects recomputed by FEmax at every call.
% INPUT:
% theta = current value of the common parameter (one entry per regressor);
% Y     = N-by-T array of outcomes;
% X     = d-by-1 cell of N-by-T regressor matrices;
% Z     = q-by-1 cell of N-by-T instrument matrices.
% OUTPUT:
% criterion = -S'*(Var\S)/2, S being the q-by-1 vector of sample moments mean(mean(Z{k}.*U));
% score     = -H'*(Var\S);
% Hessian   = -H'*(Var\H);
% H         = q-by-d Jacobian matrix of the moments.
% Var is the q-by-q estimate of the moment variance when q>d and the
% identity matrix otherwise.
[N, T] = size(Y); % sample size
[d, ~] = size(X); % number of regressors
[q, ~] = size(Z); % number of moment conditions

% fixed effects at the current theta, started from zero
[alpha, gamma] = FEmax(Y,X,theta,zeros(N,1),zeros(T,1));

A = alpha*ones(1,T);
G = ones(N,1)*gamma';
index = zeros(N,T); for k=1:d, index = index+X{k}*theta(k); end % linear index
lambda = exp(index+A+G); U = Y-lambda;

% score vector
S = zeros(q,1); for k=1:q, S(k) = mean(mean(Z{k}.*U)); end

% Dummy design for the weighted projections: N individual dummies and T-1
% time dummies, rows ordered like reshape(.,N*T,1) of an N-by-T matrix.
% (The names avoid shadowing the local function FE.)
indDummies  = kron(ones(T,1),eye(N));
timeDummies = kron(eye(T,T-1),ones(N,1));
D = [indDummies timeDummies];
w = reshape(lambda,N*T,1);

% lambda-weighted projection of every instrument and regressor on the
% dummies, computed once per variable instead of once per (k,j) pair
projZ = cell(q,1);
for k=1:q
    coef = lscov(D,reshape(Z{k},N*T,1),w);
    projZ{k} = coef(1:N)+[coef(N+1:end)' 0]; % N-by-T fitted values; last time effect normalised to 0
end
projX = cell(d,1);
for j=1:d
    coef = lscov(D,reshape(X{j},N*T,1),w);
    projX{j} = coef(1:N)+[coef(N+1:end)' 0];
end

% Jacobian matrix
H = zeros(q,d);
for k=1:q
    for j=1:d
        H(k,j) = mean(mean(lambda.*(Z{k}.*X{j}-projZ{k}.*projX{j})));
    end
end
H = -H;

if q>d
    % over-identified case: weight by the estimated moment variance
    U2 = U.^2;
    resZ = cell(q,1);
    for k=1:q, resZ{k} = Z{k}-projZ{k}; end
    Var = zeros(q,q);
    for k=1:q
        for j=k:q
            Var(k,j) = mean(mean((resZ{k}.*resZ{j}).*U2));
            Var(j,k) = Var(k,j); % symmetric by construction
        end
    end
    % linear solves instead of forming inv(Var) explicitly
    WS = Var\S;
    WH = Var\H;
else
    % identity weighting
    WS = S;
    WH = H;
end

% objective function
criterion = -(S'*WS)/2;
score     = -H'*WS;
Hessian   = -H'*WH;


%% Bias calculation
function [bias asyvar] = Bias(Y,X,Z,theta)
% Bias estimate and asymptotic covariance matrix of the GMM estimator,
% evaluated at theta with the fixed effects recomputed by FEmax.
% INPUT:
% Y     = N-by-T array of outcomes;
% X     = d-by-1 cell of N-by-T regressor matrices;
% Z     = q-by-1 cell of N-by-T instrument matrices;
% theta = value of the common parameter (one entry per regressor).
% OUTPUT:
% bias   = -(H'*(Var\H)) \ (H'*(Var\b)), with b the q-by-1 vector of moment biases;
% asyvar = inv(H'*(Var\H))/(N*T).
[N, T] = size(Y); % sample size
[d, ~] = size(X); % number of regressors
[q, ~] = size(Z); % number of moment conditions

% fixed effects at theta, started from zero
[alpha, gamma] = FEmax(Y,X,theta,zeros(N,1),zeros(T,1));

A = alpha*ones(1,T);
G = ones(N,1)*gamma';
index = zeros(N,T); for k=1:d, index = index+X{k}*theta(k); end % linear index
lambda = exp(index+A+G); U = Y-lambda;
U2 = U.^2;

% Dummy design for the weighted projections: N individual dummies and T-1
% time dummies, rows ordered like reshape(.,N*T,1) of an N-by-T matrix.
% (The names avoid shadowing the local function FE.)
indDummies  = kron(ones(T,1),eye(N));
timeDummies = kron(eye(T,T-1),ones(N,1));
D = [indDummies timeDummies];
w = reshape(lambda,N*T,1);

% lambda-weighted projections, computed once per variable
projZ = cell(q,1); resZ = cell(q,1);
for k=1:q
    coef = lscov(D,reshape(Z{k},N*T,1),w);
    projZ{k} = coef(1:N)+[coef(N+1:end)' 0]; % N-by-T fitted values; last time effect normalised to 0
    resZ{k}  = Z{k}-projZ{k};
end
projX = cell(d,1);
for j=1:d
    coef = lscov(D,reshape(X{j},N*T,1),w);
    projX{j} = coef(1:N)+[coef(N+1:end)' 0];
end

% quantities that do not depend on the instrument
rowMeanLambda = mean(lambda,2); colMeanLambda = mean(lambda,1);
rowMeanU2     = mean(U2,2);     colMeanU2     = mean(U2,1);

% Bias and Jacobian
b = zeros(q,1);
H = zeros(q,d);
for k=1:q
    Q = resZ{k};

    % terms from the individual effects (averages over the time dimension)
    B1 = -mean(mean(U.*lambda.*Q,2)./rowMeanLambda);
    B2 = -mean(rowMeanU2.*mean(lambda.*Q,2)./rowMeanLambda.^2)/2;

    % terms from the time effects (averages over the individual dimension).
    % D2 is halved exactly like B2; the previous code halved B2 a second
    % time here and left D2 unscaled.
    D1 = -mean(mean(U.*lambda.*Q,1)./colMeanLambda);
    D2 = -mean(colMeanU2.*mean(lambda.*Q,1)./colMeanLambda.^2)/2;

    b(k) = (B1+B2)/T + (D1+D2)/N;

    for j=1:d
        H(k,j) = mean(mean(lambda.*(Z{k}.*X{j}-projZ{k}.*projX{j})));
    end
end
H = -H;

% variance of the moments
Var = zeros(q,q);
for k=1:q
    for j=k:q
        Var(k,j) = mean(mean((resZ{k}.*resZ{j}).*U2));
        Var(j,k) = Var(k,j); % symmetric by construction
    end
end

% linear solves instead of forming inv(Var) explicitly
HWH = H'*(Var\H);

% bias
bias = -(HWH\(H'*(Var\b)));

% the covariance matrix itself is an output, so the inverse is needed here
asyvar = inv(HWH)/(N*T);




%% FE estimation for given value of common parameter
function [L S H IH] = FE(Y,X,theta,alpha,gamma)
% Criterion, score, Hessian and inverse Hessian for the fixed effects at a
% given value of the common parameter theta.
% INPUT:
% Y = N-by-T outcomes; X = d-by-1 cell of N-by-T regressors; theta = common
% parameter; alpha = N-by-1 individual effects; gamma = T-by-1 time effects.
% OUTPUT:
% L  = -0.5 times the squared norm of S;
% S  = stacked score [s_a; s_g]; the entry for the first individual effect
%      is removed, so S has N-1+T elements;
% H  = matrix [H_aa H_ag; H_ga H_gg] with the first individual effect removed;
% IH = inverse of H assembled from partitioned-inverse formulae.
[N, T] = size(Y); % sample size
[d, ~] = size(X); % number of regressors
scale  = sqrt(N*T);

% linear index and fitted mean
index = zeros(N,T);
for k = 1:d
    index = index + X{k}*theta(k);
end
lambda = exp(index + repmat(alpha,1,T) + repmat(gamma',N,1));
U = Y - lambda;

% scores: row sums for the individual effects (first one dropped) and
% column sums for the time effects
rowScore = sum(U,2)/scale;
s_a = rowScore(2:end);
s_g = sum(U,1)'/scale;
S   = [s_a; s_g];

L = -.5*(s_a'*s_a + s_g'*s_g);

% Hessian blocks, again without the first individual effect
negRowSum = sum(-lambda,2);
negColSum = sum(-lambda,1);
H_aa = diag(negRowSum(2:end))/scale;
H_gg = diag(negColSum)/scale;
H_ag = -lambda(2:end,:)/scale;
H_ga = H_ag';

H = [H_aa, H_ag; H_ga, H_gg];

% H_aa and H_gg are diagonal, so their inverses are elementwise reciprocals
IH_aa = diag(1./diag(H_aa));
IH_gg = diag(1./diag(H_gg));

% partitioned inverse: block-diagonal inverse Schur complements times the
% off-diagonal correction
schurInv = blkdiag(inv(H_aa - H_ag*IH_gg*H_ga), inv(H_gg - H_ga*IH_aa*H_ag));
offDiag  = [eye(N-1), -H_ag*IH_gg; -H_ga*IH_aa, eye(T)];

IH = schurInv*offDiag;


function [alpha gamma f condition it]=FEmax(Y,X,theta,alpha,gamma)
% Computes the fixed effects for a given theta by Newton steps with step
% halving, using the criterion, score and inverse Hessian returned by FE.
% INPUT:
% Y, X, theta = data and common parameter, passed through to FE;
% alpha = N-by-1 starting values for the individual effects;
% gamma = T-by-1 starting values for the time effects.
% OUTPUT:
% alpha, gamma = last accepted values (alpha(1) is never moved);
% f            = criterion value at the last accepted point;
% condition    = false once an accepted step is shorter than tol or raises
%                the criterion by no more than tol;
% it           = number of outer iterations performed.
[N, ~] = size(Y);
tol=1e-3; maxit=100; smalleststep=.5^20;
it=1; condition=1; improvement=1;
[f, g, ~, IH] = FE(Y,X,theta,alpha,gamma);
while it<=maxit && condition==1 && improvement==1
    % Newton direction; the leading 0 keeps the first individual effect
    % fixed, matching the entry FE removes from the score and Hessian
    d = [0; -IH*g];
    step=1; improvement=0;
    while step>=smalleststep && improvement==0
        n_alpha = alpha+step*d(  1 : N);
        n_gamma = gamma+step*d(N+1:end);
        [ff, gg, ~, IHH] = FE(Y,X,theta,n_alpha,n_gamma);
        if (ff-f)/abs(f)>=-1e-5
            improvement=1;
            condition=sqrt(step*step*(d'*d))>tol && (ff-f)>tol;
            alpha=n_alpha;
            gamma=n_gamma;
            % carry the inverse Hessian of the accepted point forward;
            % previously the one from the starting values was reused in
            % every iteration
            f=ff; g=gg; IH=IHH;
        else
            step=step/2;
        end
    end
    it=it+1;
end
it=it-1;



%% Newton algorithm used for optimization
function [x condition it J]=Newton(FUN,x,varargin) % varargout
% Maximises FUN by the Newton-Raphson method with step halving, starting at x.
% FUN is called as [f g H J] = feval(FUN,x,varargin{:}) and must return the
% criterion value f, the gradient g, the Hessian H and a fourth output J
% that is passed back unchanged from the last accepted point.
% OUTPUT:
% x         = last accepted point;
% condition = false once an accepted step is shorter than tol or raises
%             the criterion by no more than tol;
% it        = number of outer iterations performed;
% J         = fourth output of FUN at the last accepted point.
tol = 1e-5; maxit = 100; smalleststep = .5^20;
it = 1; condition = 1; improvement = 1;
[f g H J] = feval(FUN,x,varargin{:});
while it<=maxit && condition==1 && improvement==1
    % search direction: matrix form for a square Hessian larger than 1-by-1,
    % elementwise division otherwise
    [nRows, nCols] = size(H);
    if nRows==nCols && nCols>1
        d = -inv(H)*g;
    else
        d = -g./H;
    end

    % halve the step until the criterion does not deteriorate and the
    % Hessian at the trial point is free of NaN and Inf
    step = 1; improvement = 0;
    while step>=smalleststep && improvement==0
        xTrial = x+step*d;
        [ff gg HH JJ] = feval(FUN,xTrial,varargin{:});
        finiteHessian = ~any(isnan(HH(:))) && ~any(isinf(HH(:)));
        relativeGain  = (ff-f)/abs(f);
        if relativeGain>=-1e-5 && finiteHessian
            improvement = 1;
            stepLength  = sqrt(step*step*(d'*d));
            condition   = stepLength>tol && (ff-f)>tol;
            x = xTrial; f = ff; g = gg; H = HH; J = JJ;
        else
            step = step/2;
        end
    end
    it = it+1;
end
it = it-1;