
# This code calls and runs the matrix decomposition in Richey and Rosburg
# "Decomposing economic mobility transition matrices" published in JoAE


# The code runs using data 'nlsy.79.ind.merged1' which is assumed to be already loaded 

# Needed libraries and source codes
library(Hmisc)
source("procedures.decompose.matrix_JoAE.R")

## Define base and treatment data and weights
# NOTE: The version of copula estimation used in the procedure called does not support weights.
#       One could adjust the copula estimation to that of the original min.distance from Rothe's original
#       code and then weights would be usable.

# We define our 'treatment' group to be those children from 
# households in the top quartile. Our counterfactuals then conduct counterfactuals if all children
# had their characteristics/returns, etc.

dat.treat <- nlsy.79.ind.merged1[nlsy.79.ind.merged1$parent.cincome >= quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.75),]

dat.base1 <- nlsy.79.ind.merged1[nlsy.79.ind.merged1$parent.cincome < quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.25),]

dat.base2 <- nlsy.79.ind.merged1[nlsy.79.ind.merged1$parent.cincome < quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.5) & 
                                   nlsy.79.ind.merged1$parent.cincome >= quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.25),]

dat.base3 <- nlsy.79.ind.merged1[nlsy.79.ind.merged1$parent.cincome < quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.75) & 
                                   nlsy.79.ind.merged1$parent.cincome >= quantile(nlsy.79.ind.merged1$parent.cincome, probs = 0.5),]


# Define formula and covariates

formula <- lcincome~esteem+rotter+perlin+afqt+grade+exp+I(exp^2)
covariate <- ~esteem+rotter+perlin+afqt+grade+exp

# Create objects to pass to function that performs decomposition 

dat <- list(NA)
dat[[1]] <- dat.base1
dat[[2]] <- dat.base2
dat[[3]] <- dat.base3
dat[[4]] <- dat.treat

n1 <- dim(dat.base1)[1]
n2 <- dim(dat.base2)[1]
n3 <- dim(dat.base3)[1]
n4 <- dim(dat.treat)[1]

weights.base1 <- rep(1,n1)
weights.base2 <- rep(1,n2)
weights.base3 <- rep(1,n3)
weights.treat <- rep(1,n4)
 
weights <- list(NA)
weights[[1]] <- weights.base1
weights[[2]] <- weights.base2
weights[[3]] <- weights.base3
weights[[4]] <- weights.treat

# First, compute actual transition matrix to compare to transition matrix from the estimation process.
# Note: in our paper, we point out that the decomposition uses estimated
# components to recreate the 'empirical' matrix. Therefore, the reported 'gaps' do not 
# exactly replicate the empirical-independent gap.

t.mat.79 <- matrix(nrow = 4, ncol = 4)
for(i in c(0.25)){   
  t.mat.79[1,1] <- (dim(dat.base1[dat.base1$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i), ])[1])/dim(dat.base1)[1]
  t.mat.79[2,1] <- (dim(dat.base2[dat.base2$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i), ])[1])/dim(dat.base2)[1]
  t.mat.79[3,1] <- (dim(dat.base3[dat.base3$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i), ])[1])/dim(dat.base3)[1]
  t.mat.79[4,1] <- (dim(dat.treat[dat.treat$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i), ])[1])/dim(dat.treat)[1]
}
for(i in c(0.5, 0.75, 1)){   
    t.mat.79[1,i*4] <- (dim(dat.base1[dat.base1$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i) & 
                                     dat.base1$cincome > quantile(nlsy.79.ind.merged1$cincome, probs = (i - 0.25)), ])[1])/dim(dat.base1)[1]
    t.mat.79[2,i*4] <- (dim(dat.base2[dat.base2$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i) & 
                                      dat.base2$cincome > quantile(nlsy.79.ind.merged1$cincome, probs = (i - 0.25)), ])[1])/dim(dat.base2)[1]
    t.mat.79[3,i*4] <- (dim(dat.base3[dat.base3$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i) & 
                                      dat.base3$cincome > quantile(nlsy.79.ind.merged1$cincome, probs = (i - 0.25)), ])[1])/dim(dat.base3)[1]
    t.mat.79[4,i*4] <- (dim(dat.treat[dat.treat$cincome <= quantile(nlsy.79.ind.merged1$cincome, probs = i) & 
                                      dat.treat$cincome > quantile(nlsy.79.ind.merged1$cincome, probs = (i - 0.25)), ])[1])/dim(dat.treat)[1]
}
    


set.seed(12345) # set seed to ensure constancy of results 

# NOTE: output is a list with 8 objects
# 1. index decomposition results (NOTE: in output 1, M1 and M2 are not those reported in our paper, 
#                                       M1 in this output is related to the trace of the matrix and M2 in this output
#                                       is the unsymmetrized eigenvalue index.
#                                       M1 from the paper (Bartholomew index) is M3 in this output and M2 from the paper
#                                       (the symmetrized eigen value) is M4 in this ouput.)
# 2. top quartile decomposition results
# 3. bottom quartile decomposition results

# NOTE: the signs on outputs 1-3 will populate with the opposite sign as in the paper.
#       In the paper the 'gap' is the v(empirical matrix) - v(independent matrix), however the procedures has 
#       it written in the opposite direction. This was a change in a later version of the paper done for reasons 
#       of interpretation. So to match the paper one should report the negative of the results in output 1-3.

#       HOWEVER: we correct this in the final tables (below after bootstrap). So if one only looks at the final 
#                tables with SEs included the signs are corrected. 

# 4. estimated empirical matrix
# 5. estimated counterfactual matrix (from the aggregate decomposition)
# 6. estimated empirical and counterfactual indices (also from aggregate decompositon)
# 7. composition effect (aggregate decomp) for the full matrix
# 8. structure effect (aggregate decomp) for the full matrix

out.table <- full.decomp(formula = formula,
                         covariate = covariate,  
                         dat = dat, weights)
set.seed(as.numeric(format(Sys.time(), "%OS3"))*1000)
out.table


##############################################################
#
# Begin bootstrap replications to generate standard errors (SEs)
#
###############################################################


# First create empty lists to hold bootstrap replications of each estimated object

out.list1 <- list(NA) # bootstraps for indexes decompos
#                       NOTE: 3rd row is the Bartholomew index, 4th row is 1 minus symmetrized 2nd eigen value (both reported in paper)
                      # 1st row is related to trace (not reported in paper) and 2nd row is 1 minus 2nd standard eigen value (also not reported in paper)
                      # these other indices were reported in an earlier version of the paper

out.list2 <- list(NA) # bootstraps for top quartile decomps
out.list3 <- list(NA) # bootstraps for bottom quartile decomps

out.list4 <- list(NA) # bootstraps for empirical matrix
out.list5 <- list(NA) # bootstraps for counterfactual matrix
out.list6 <- list(NA) # bootstraps for indices (empirical and counter) (this is a 4x2 matrix with columns empirical and counter
                      # and rows corresponding to 4 index measures - order same as out.list1)

out.list7 <- list(NA) # bootstraps for full matrix composition effect
out.list8 <- list(NA) # bootstraps for full matrix structure effect

bskip <- 1   # Set to positive integer to use a m-out-of-n bootstrap (useful for large data sets)

for (b in 1:2) {
      
      bsamp1 <- sample(1:dim(dat.base1)[1], 
                       round(dim(dat.base1)[1]/bskip), 
                       replace=TRUE)
      bsamp2 <- sample(1:dim(dat.base2)[1], 
                       round(dim(dat.base2)[1]/bskip), 
                       replace=TRUE)
      bsamp3 <- sample(1:dim(dat.base3)[1], 
                       round(dim(dat.base3)[1]/bskip), 
                       replace=TRUE)
      bsamp4 <- sample(1:dim(dat.treat)[1], 
                       round(dim(dat.treat)[1]/bskip), 
                       replace=TRUE )
      
      
      dat <- list(NA)
      dat[[1]] <- dat.base1[bsamp1,]
      dat[[2]] <- dat.base2[bsamp2,]
      dat[[3]] <- dat.base3[bsamp3,]
      dat[[4]] <- dat.treat[bsamp4,]
      
      ww <- list(NA)
      ww[[1]] <- rep(1,length(bsamp1))
      ww[[2]] <- rep(1,length(bsamp2))
      ww[[3]] <- rep(1,length(bsamp3))
      ww[[4]] <- rep(1,length(bsamp4))
      
      
      set.seed(12345) # to ensure constancy of results based on MC integration
      out.temp <- full.decomp(formula = formula,
                              covariate = covariate,  
                              dat = dat, weights = ww)
      out.list1[[b]] <- out.temp[[1]]
      out.list2[[b]] <- out.temp[[2]]
      out.list3[[b]] <- out.temp[[3]]
      out.list4[[b]] <- out.temp[[4]]
      out.list5[[b]] <- out.temp[[5]]
      out.list6[[b]] <- out.temp[[6]]
      out.list7[[b]] <- out.temp[[7]]
      out.list8[[b]] <- out.temp[[8]]
      
      
      set.seed(as.numeric(format(Sys.time(), "%OS3"))*1000)                             
      
      save(out.list1,file="outlist1.Rdata")
      save(out.list2,file="outlist2.Rdata")
      save(out.list3,file="outlist3.Rdata")
      print(c("BOOTSTRAP REPLICATION: ", b))
}

# Create table structure for SEs

sd.table1 <- out.table[[1]]
sd.table2 <- out.table[[2]]
sd.table3 <- out.table[[3]]
sd.table4 <- out.table[[4]]
sd.table5 <- out.table[[5]]
sd.table6 <- out.table[[6]]
sd.table7 <- out.table[[7]]
sd.table8 <- out.table[[8]]

# Use 'sapply' function to create SEs for results as std.dev. over bootstrap results

for (i in 1:dim(sd.table1)[1]) {
      for (j in 1:dim(sd.table1)[2]) {
            sd.table1[i,j]  <- sd(sapply(out.list1, function(x) {x[i,j]})/sqrt(bskip))
            sd.table2[i,j]  <- sd(sapply(out.list2, function(x) {x[i,j]})/sqrt(bskip))
            sd.table3[i,j]  <- sd(sapply(out.list3, function(x) {x[i,j]})/sqrt(bskip))
      }
}

for (i in 1:dim(sd.table4)[1]) {
      for (j in 1:dim(sd.table4)[2]) {
            sd.table4[i,j]  <- sd(sapply(out.list4, function(x) {x[i,j]})/sqrt(bskip))
            sd.table5[i,j]  <- sd(sapply(out.list5, function(x) {x[i,j]})/sqrt(bskip))
            sd.table7[i,j]  <- sd(sapply(out.list7, function(x) {x[i,j]})/sqrt(bskip))
            sd.table8[i,j]  <- sd(sapply(out.list8, function(x) {x[i,j]})/sqrt(bskip))
      }
}

for (i in 1:dim(sd.table6)[1]) {
      for (j in 1:dim(sd.table6)[2]) {
            sd.table6[i,j]  <- sd(sapply(out.list6, function(x) {x[i,j]})/sqrt(bskip))
      }
}

# Create larger table to hold pt. estimates and SEs

tab1 <- cbind(out.table[[1]][,1:4], out.table[[1]][,1:4]) 
tab2 <- cbind(out.table[[2]][,1:4], out.table[[2]][,1:4]) 
tab3 <- cbind(out.table[[3]][,1:4], out.table[[3]][,1:4]) 
tab4 <- cbind(out.table[[4]][,1:4], out.table[[4]][,1:4]) 
tab5<- cbind(out.table[[5]][,1:4], out.table[[5]][,1:4])
tab7<- cbind(out.table[[7]][,1:4], out.table[[7]][,1:4])
tab8<- cbind(out.table[[8]][,1:4], out.table[[8]][,1:4])

# Merge pt. estimates and (SEs) into one table

for (i in 1:4) {
      tab1[,1 + 2*(i-1)] <- -round(100*out.table[[1]][,i], digits = 3)
      tab1[,2 + 2*(i-1)] <- paste("(",round(100*sd.table1[,i], digits = 3),")", sep = "") 
      
      tab2[,1 + 2*(i-1)] <- -round(100*out.table[[2]][,i], digits = 3)
      tab2[,2 + 2*(i-1)] <- paste("(",round(100*sd.table2[,i], digits = 3),")", sep = "") 
      
      tab3[,1 + 2*(i-1)] <- -round(100*out.table[[3]][,i], digits = 3)
      tab3[,2 + 2*(i-1)] <- paste("(",round(100*sd.table3[,i], digits = 3),")", sep = "") 
      
      tab4[,1 + 2*(i-1)] <- round(100*out.table[[4]][,i], digits = 3)
      tab4[,2 + 2*(i-1)] <- paste("(",round(100*sd.table4[,i], digits = 3),")", sep = "")
      
      tab5[,1 + 2*(i-1)] <- round(100*out.table[[5]][,i], digits = 3)
      tab5[,2 + 2*(i-1)] <- paste("(",round(100*sd.table5[,i], digits = 3),")", sep = "")
      
      tab7[,1 + 2*(i-1)] <- round(100*out.table[[7]][,i], digits = 3)
      tab7[,2 + 2*(i-1)] <- paste("(",round(100*sd.table7[,i], digits = 3),")", sep = "")
      
      tab8[,1 + 2*(i-1)] <- round(100*out.table[[8]][,i], digits = 3)
      tab8[,2 + 2*(i-1)] <- paste("(",round(100*sd.table8[,i], digits = 3),")", sep = "")
}
colnames(tab1) <- c("M1","M1.se","M2","M2.se","M3","M3.se", "M4", "M4.se" ) # Note again the paper reports what is listed here as M3 and M4
                                                                            # The reported M3 is the Bartholomew index (called M1 in paper)
                                                                            # The reported M4 is the symmetrized eigen value index (called M2 in paper)
                                                                            # This discrepency is due to indices used in an earlier version of the paper
colnames(tab2) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )
colnames(tab3) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )
colnames(tab4) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )
colnames(tab5) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )
colnames(tab7) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )
colnames(tab8) <- c("1st Q","1st Q.se","2nd Q","2nd Q.se","3rd Q","3rd Q.se", "4th Q", "4th Q.se" )

tab6 <- cbind(out.table[[6]], out.table[[6]]) 
 
for (i in 1:2) {
      tab6[,1 + 2*(i-1)] <- round(100*out.table[[6]][,i], digits = 3)
      tab6[,2 + 2*(i-1)] <- paste("(",round(100*sd.table6[,i], digits = 3),")", sep = "") 
 
}
colnames(tab6) <- c("Empirical","Emp.se","Counterfactural","Counter.se" )
 

library(xtable)

xtable( tab1 )
xtable( tab2 )
xtable( tab3 )
xtable( tab4 )
xtable( tab5 )
xtable( tab6 )
xtable( tab7 )
xtable( tab8 )