################################################################################
# Chou, Cheng and Ruoyao Shi, "What Time Use Surveys Can (And Cannot) Tell Us
# About Labor Supply", Journal of Applied Econometrics, forthcoming.
# Produce Table 2, Figure 1 and 2.
# Written by Cheng Chou (https://chengchou.github.io), Jan 29, 2021.
################################################################################

dat.kids <- readRDS("dtus.rds")
source("estimators.R")
# LOAD PACKAGES #
library(ggplot2)
library(ggpubr) # multiple plots on one single graph
library(dplyr)
library(tidyr)
library(Matrix) # Do big matrix computation
library(purrr)
library(AER)
library(xtable) # create latex table
options(xtable.include.rownames = FALSE) # Do not append row number in xtable
options(xtable.booktabs = TRUE) # Use booktab
options(xtable.caption.placement = 'top') # Caption should be on the top
options(xtable.table.placement = NULL) # Do not specify placement

################################################################################
# ELASTICITY OF THE NUMBER OF CHILDREN ON WEEKLY HOURS OF WORK #
################################################################################
# Given dependent variable name and sex, using recalled hours or weekly hours
# to estimate the parameters and elasticities.
# name of regressors
xnames <- c('NCHILD', 'EDTRYCompleted secondary', 'EDTRYAbove 2ndry edu')
prob.atus <- c(0.25, rep(0.1, 5), 0.25)
dtus.lm <- function(y, sex) {
    reg1 <- as.formula(
      paste0(y, '~ NCHILD + EDTRY + AGE + I(AGE ^ 2) + SECTOR + URBAN + YEAR')
      )
    lm1 <- lm(reg1, weights = PROPWT, data = dat.kids %>% filter(SEX == sex))
    # return the same list as dtus.im function and R square
    beta.lm <- coef(lm1)[xnames]
    vcov.lm <- vcov(lm1)[xnames, xnames]
    rsq <- summary(lm1)$r.squared
    # Elasticities
    meanHours <- dat.kids %>% filter(SEX == sex) %>% pull(y) %>% mean()
    elas.lm <- beta.lm / meanHours
    vcov.elas.lm <- vcov.lm / meanHours ^ 2
    # number of regression coefficients---it will be used by imputed estimator.
    p <<- length(coef(lm1))
    list(beta = beta.lm, vcov = vcov.lm, elas = elas.lm, vcov.elas = vcov.elas.lm, rsq = rsq)
}

# Parameters from OLS
lm.kids <- list() # list OLS
for (y in c('WORKHRS', 'WKHRTUS')) {
  for (sex in c('Man', 'Woman')) {
    lm.kids[[y]][[sex]] <- dtus.lm(y, sex)
  }
}

# Use imputed estimator to estimate the model.
dtus.impute <- function(dat.kids.sex) {
  # Use imputed estimator to estimate the same model.
  # Impute hours in all days of a week
  Hours.hat <- matrix(nr = nrow(dat.kids.sex), nc = 7)
  Quz <- matrix(0, nr = p, nc = p)
  rsq.day <- c()
  resd <- list()
  for (day in 1L : 7L) {
    # day now needs to be integer.
    dat1 <- dat.kids.sex %>% filter(RandomDay1 == day)
    lm.day <- lm(Hours1 ~ NCHILD + EDTRY + AGE + I(AGE ^ 2) + SECTOR + URBAN + YEAR, 
      weights = PROPWT,
      data = dat1, x = TRUE)
    # resd[[day]] <- tibble(V = lm.day$residuals, DAY = dat1$RandomDay1)
    resd[[day]] <- dat1 %>% select(SAMPLE, HLDID, PERSID) %>% 
      mutate(V = lm.day$residuals, DAY = dat1$RandomDay1)
    rsq.day[day] <- summary(lm.day)$r.squared
    # Find unused levels
    id <- which(!(dat.kids.sex$YEAR %in% levels(droplevels(dat1$YEAR))))
    dat.kids.sex2 <- dat.kids.sex
    dat.kids.sex2$YEAR[id] <- NA
    Hours.hat[ , day] <- predict(lm.day, newdata = dat.kids.sex2)
    # Calculate variance using Theorem 3 (valid daily IV assumption). It is
    # simple here because we assume X are exogenous.
    U <- Diagonal(x = (lm.day$residuals) ^ 2)
    Quz <- Quz + (t(U %*% lm.day$x) %*% lm.day$x) / nrow(dat1) * (1 / prob.atus[day])
  }
  resd <- bind_rows(resd)
  dat.kids2 <- dat.kids.sex %>% mutate(WKHRTUS.hat = rowSums(Hours.hat))
  #   im.sex <- lm(WKHRTUS.hat ~ NCHILD + EDTRY + AGE + I(AGE ^ 2) + SECTOR + URBAN + YEAR, weights = PROPWT, data = dat.kids2, x = TRUE)
  im.sex <- ivreg(WKHRTUS.hat ~ NCHILD + EDTRY + AGE + I(AGE ^ 2) + SECTOR + URBAN + YEAR, weights = PROPWT, data = dat.kids2, x = TRUE)
  resd <- resd %>% left_join(
    dat.kids2 %>% select(SAMPLE, HLDID, PERSID) %>% mutate(U = im.sex$residuals),
    by = c('SAMPLE', 'HLDID', 'PERSID')
    )
  Omega.im <- var.im(im.sex, resd)
  vcov.im <- Omega.im[xnames, xnames]
  beta.im <- im.sex$coefficients[xnames]

  # Calculate elasticities
  meanHours <- dat.kids.sex %>% group_by(RandomDay1) %>% 
    summarize(meanHoursDay = mean(Hours1)) %>% pull(meanHoursDay) %>% sum()
  elas.im <- beta.im / meanHours
  vcov.elas.im <- vcov.im / meanHours ^ 2
  list(beta = beta.im, elas = elas.im, 
    vcov = vcov.im, vcov.elas = vcov.elas.im, rsq = mean(rsq.day))
}

# Calculate the std err
for (sex in c('Man', 'Woman')) {
  lm.kids[['Imputed']][[sex]] <- dtus.impute(dat.kids %>% filter(SEX == sex))
}

# Do Hausman test: Recall - Impute, DTUS - Impute
h <- list()
for (y in c('WORKHRS', 'WKHRTUS')) {
  for (sex in c('Man', 'Woman')) {
    b.ef <- lm.kids[[y]][[sex]]$beta
    b.if <- lm.kids[['Imputed']][[sex]]$beta
    v.ef <- lm.kids[[y]][[sex]]$vcov
    v.if <- lm.kids[['Imputed']][[sex]]$vcov
    h[[paste(y, sex)]] <- hausman(b.ef, b.if, v.ef, v.if) %>% unlist()
  }
}
# Hausman for beta comparison
h <- bind_cols(h) %>% mutate("Imputed Man" = NA, "Imputed Woman" = NA)

# Sample size per group
n.woman <- nrow(dat.kids %>% filter(SEX == "Woman"))
n.man <- nrow(dat.kids %>% filter(SEX == "Man"))
n.sex <- list(Woman = n.woman, Man = n.man)

# R squared
rsq.lm <- lm.kids %>% map_depth(2, ~ .$rsq)

# Create a latex table of regression coefficients
df_template <- 
  tibble(
    variables = c('$n$ of kids aged $<$ 18', NA, 'Educ: completed 2ndry', NA, 'Educ: above 2ndry', NA, 'P value of Hausman test', '$n$ of Obs.', '$R$ squared'),
    "Man WORKHRS" = character(length(variables)),
    "Man WKHRTUS" = character(length(variables)),
    "Man Imputed" = character(length(variables)),
    "Empty" = NA,
    "Woman WORKHRS" = character(length(variables)),
    "Woman WKHRTUS" = character(length(variables)),
    "Woman Imputed" = character(length(variables))
  )
para_df <- df_template
# Regression coefficients table
joinpara <- function(l1) {
  cbind(
    l1$beta,
    l1$vcov %>% diag() %>% map_dbl(~round(sqrt(.), 3)) %>% paste0('(', ., ')')
    ) %>% t() %>% as.vector()
}
for (y in c('WORKHRS', 'WKHRTUS', 'Imputed')) {
  for (sex in c('Man', 'Woman')) {
    para_df[[paste(sex, y)]] <- 
      c(joinpara(lm.kids[[y]][[sex]]),
        h %>% select(paste(y, sex)) %>% slice(2) %>% unlist(),
      paste0('{', n.sex[[sex]], '}'), rsq.lm[[y]][[sex]])
  }
}
colnames(para_df) <- c("", "Man: Recalled Hours", "Man: Time Use Hours", 
  "Man: Imputed Estimator", "", "Woman: Recalled Hours", 
  "Woman: Time Use Hours", "Woman: Imputed Estimator")

# Save latex table for parameters estimation.
print(
  xtable(
    para_df, 
    caption = "Weekly Labor Supply Estimation (Regression Coefficients): DTUS"
    ), 
  # A latex table will be saved.
  file = "DTUSKidsParameters.tex",
  sanitize.text.function = function(x) {x}
)

# Create a latex table of elasticities.
elst_df <- df_template
joinelst <- function(l1) {
  cbind(
    l1$elas * 100,
    l1$vcov.elas %>% diag() %>% map_dbl(~round(100 * sqrt(.), 2)) %>% paste0('(', ., ')')
    ) %>% t() %>% as.vector()
}
for (y in c('WORKHRS', 'WKHRTUS', 'Imputed')) {
  for (sex in c('Man', 'Woman')) {
    elst_df[[paste(sex, y)]] <- c(joinelst(lm.kids[[y]][[sex]]),
      h %>% select(paste(y, sex)) %>% slice(2) %>% unlist(),
      paste0('{', n.sex[[sex]], '}'), rsq.lm[[y]][[sex]])
  }
}
colnames(elst_df) <- c("", "Man: Recalled Hours", "Man: Time Use Hours", 
  "Man: Imputed Estimator", "", "Woman: Recalled Hours", 
  "Woman: Time Use Hours", "Woman: Imputed Estimator")
# Save latex table for parameters estimation.
print(
  xtable(
    elst_df, 
    caption = "Weekly Labor Supply Elasticity Estimation: DTUS"
    ), 
  # A latex table will be saved. This is Table 2 of in the paper.
  file = "DTUSKidsElasticities.tex",
  sanitize.text.function = function(x) {x}
)

################################################################################
# DISTRIBUTIONAL PROPERTIES OF MEASUREMENT ERROR #
################################################################################
# Below, "true hour worked" = WKHRTUS (weekly hours of work from time use study)
# Scatterplot of measurement error and "true hours worked"
dat.wide <- readRDS("dtusPlotMeasurementError.rds")
measurement1 <- 
  ggscatter(dat.wide, color = 'wheat3', x = 'WKHRTUS', y = 'MeasureError', add
    = 'reg.line', add.params = list(color = 'black')) +
  xlab('Weekly Hours of Work in the Dutch Time Use Survey') +
  ylab('Measurement Error')

# Kernel density of measurement error
merrDensity <- ggdensity(dat.wide, x = 'MeasureError') +
  xlab('Measurement Error')

# Correlation of between the measurement error and the 'true' weekly hours of
# work AND the density of the measurement error.
twomerr <- ggarrange(measurement1, merrDensity, labels = c("A", "B"), ncol = 2, nrow = 1)
ggsave(twomerr, file = 'CorrelationDensityMeasurementError.png', width = 12, height = 6)

################################################################################
# DRAW RANDOM DAILY HOURS #
################################################################################
dat.randomDay <- readRDS("dtusPlotRandomDailyHours.rds")
# Pick a random day for each individual
# Plot the density graphs together.
# 7 * any day
twoDensties <- 
  ggplot(data = dat.randomDay) +
  geom_density(aes(x = WKHRTUS, weight = PROPWT, linetype = 'DTUS Weekly')) +
  geom_density(aes(x = RandomDayHours1, linetype = 'DTUS Daily * 7: Draw 1')) +
  geom_density(aes(x = RandomDayHours2, linetype = 'DTUS Daily * 7: Draw 2')) +
  scale_linetype_manual(name = 'Data', 
                        values = c('DTUS Weekly' = 'solid', 
                                   'DTUS Daily * 7: Draw 1' = 'dashed',
                                   'DTUS Daily * 7: Draw 2' = 'dotted')) +
  xlab('Weekly Hour of Work') +
  scale_x_continuous(breaks = seq(0, 130, by = 20)) +
  theme_minimal() +
  theme(legend.position = "bottom")
ggsave(twoDensties, file = 'DensitiesWeeklyRandomDaily7.png', height = 9, width = 11)
