# This code merges several data sets into one final data set based on the GEOCODE NLSY data. 
# The geocode data must be obtained by the researcher from the NLSY. 
# The code merges: 
# 1. Public NLSY data cleaned for use
# 2. State of residence from the NLSY geocode data (1979-2012)
# 3. State level cost of living index (used to adjust wages) - obtained from Richard Fording's website - https://rcfording.wordpress.com/datasets/ 
# 4. County level wages (1974-1981) from the U.S. Bureau of Economic Analysis (BEA)
# 5. State unemployment levels (1976-1981) from the U.S. Bureau of Labor Statistics (BLS)
# 6. Information on local 2 and 4 year colleges (presence in county and tuition) - taken from Kling (2001, Journal of Business & Economic Statistics, Vol 19, Issue 3) which is based on 1977 HEGIS data

# Variables from data sets 4-6 were not used in the final version of the JoAE paper but were used in earlier versions
# in which we conducted sensitivity analysis with instruments for education. These sensitivity analyses were not included in the final version.


#################################################
#                                               #
# Load library and public nlsy data (cleaned)   #
#                                               # 
#################################################

library(spatstat)

nlsy.79.initial <- read.csv(file = "nlsy.79.matrix.final.csv")


#######################
#                     #
#   COLA adjustment   #
#                     #
#######################

# Add zero-filled placeholder columns for the cost-of-living-corrected incomes.
# parent.cincome and cincome are filled in further down, once each respondent's
# state of residence is known.
nlsy.79.initial[["parent.cincome"]] <- 0
nlsy.79.initial[["cincome"]] <- 0

# Read the state-of-residence file and the state cost-of-living adjustments (COLA).

  # State of residence: nlsy.79.states_orig.csv is derived from the restricted
  # NLSY geocode data; researchers must obtain access to it through the BLS.
  # The file holds one state-of-residence variable per available survey year
  # from 1979 to 2012 (every year from 1979 to 1994, every second year from
  # 1994 to 2012). Variables are named "stateZZ", where ZZ is the two-digit
  # year (state79 = state of residence in 1979, state12 = state of residence
  # in 2012).

  # COLA: the adj.cola column is used (COLAs rescaled to have mean = 1).

nlsy.79.states <- read.csv(file = "nlsy.79.states_orig.csv")
nlsy.cola <- read.csv(file = "nlsy.cola.csv")

# The NLSY does not report state of residence for 1997 (the survey was only
# fielded every second year by then), so state97 is imputed from nearby years.
#
#   Priority order: 1996 value, then 1998, then 1994, then 1993.
#   A code counts as available only when it is positive; NA is treated the same
#   as a non-positive (missing) code.
#   If none of the four years is available, state97 is set to -999999 so the
#   observation can be dropped later in the code.
#   This mitigates lost observations when the state is not available in 1996.
#
# The fill is vectorised (no per-row `if`), so it also works on an empty data
# frame and does not stop on NA codes.

state97.sources <- c("state96", "state98", "state94", "state93")
stopifnot(all(state97.sources %in% names(nlsy.79.states)))

state97.filled <- rep(-999999, nrow(nlsy.79.states))

# Apply the sources from lowest to highest priority so that a higher-priority
# year overwrites any value taken from a lower-priority one.
for (state97.src in rev(state97.sources)) {
  state97.candidate <- nlsy.79.states[[state97.src]]
  state97.usable <- !is.na(state97.candidate) & state97.candidate > 0
  state97.filled[state97.usable] <- state97.candidate[state97.usable]
}

nlsy.79.states$state97 <- state97.filled

rm(state97.sources, state97.src, state97.candidate, state97.usable, state97.filled)


# Drop the state variables that are not needed before merging with the public
# NLSY data. Only state79 and the imputed state97 are kept; every other survey
# year listed below is removed (a name that is not present is simply skipped).
unused.state.years <- c(80:94, "96", "98", "00", "02", "04", "06", "08", "10", "12")

for (state.year in unused.state.years) {
  nlsy.79.states[[paste0("state", state.year)]] <- NULL
}

rm(unused.state.years, state.year)

# Join the cleaned public NLSY data with the geocode states on 'id'.
nlsy.79.ind.merged <- merge(nlsy.79.initial, nlsy.79.states, by = "id")

# Keep a respondent only when all of the following hold:
#   - a 1979 state is available (state79 > 0)
#   - a 1997 state is available (state97 > 0)
#     -- these two reduce the sample size from 1345 to 1340
#   - the 1997 state code is below 60, which drops observations in American
#     Samoa (lose 1 observation)
# Rows where the combined condition is NA are dropped, as subset() would do.
keep.row <- nlsy.79.ind.merged$state79 > 0 &
  nlsy.79.ind.merged$state97 > 0 &
  nlsy.79.ind.merged$state97 < 60

nlsy.79.ind.merged <- nlsy.79.ind.merged[keep.row & !is.na(keep.row), , drop = FALSE]

rm(keep.row)

# Adjust incomes with the state-specific COLA, one state code at a time.
# Parental income is multiplied by the 1979 COLA of the 1979 state of residence;
# respondent income is multiplied by the 1997 COLA of the 1997 state of residence.

  # Note: DC (state code 11) has no COLA for these years (only available for
  # 2012), so it is given the highest adj.cola reported for any state in the
  # corresponding year.

# Return the adj.cola value(s) for one state code in one year.
state.cola <- function(state.code, cola.year) {
  in.year <- nlsy.cola$year == cola.year
  if (state.code == 11) {
    max(nlsy.cola$adj.cola[in.year])
  } else {
    nlsy.cola$adj.cola[nlsy.cola$fips == state.code & in.year]
  }
}

observed.states <- sort(unique(c(nlsy.79.ind.merged$state79, nlsy.79.ind.merged$state97)))

for (state.code in observed.states) {
  cola.1979 <- state.cola(state.code, 1979)
  cola.1997 <- state.cola(state.code, 1997)

  lived.here.79 <- nlsy.79.ind.merged$state79 == state.code
  lived.here.97 <- nlsy.79.ind.merged$state97 == state.code

  nlsy.79.ind.merged$parent.cincome[lived.here.79] <-
    nlsy.79.ind.merged$parent.income[lived.here.79] * cola.1979
  nlsy.79.ind.merged$cincome[lived.here.97] <-
    nlsy.79.ind.merged$income[lived.here.97] * cola.1997
}

rm(state.cola, observed.states, state.code)
rm(cola.1979, cola.1997, lived.here.79, lived.here.97)


#####################################################################################################
#                                                                                                   #
#  NOTE: The rest of this code merges the IVs for education that were ultimately                    #
#        not used in the JoAE final version. However, the (slightly) reduced data set               #
#        resulting from the addition of the IVs was used (for potential robustness checks).         #
#        If one wished, they could omit the rest of this merge code - the sample size difference    #
#        is minimal and results do not change in any substantive way.                               #
#                                                                                                   #
#        Also, note that all the IVs require the Geocode data.                                      #
#                                                                                                   #
#####################################################################################################

#######################
#   Merge IV data     #
#######################

# Read the two respondent-level geocode extracts used for the IVs.

  # "nlsy.79.82.state.fips.csv": respondents' fips and state codes for
  # 1979-1982 plus the state and fips code at age 14. Variables are named
  # "fipsZZ" or "stateZZ", where ZZ is the year.
nlsy.79.fips <- read.csv(file = "nlsy.79.82.state.fips.csv")

  # "nlsy.79.birthyear.17.csv": respondent birth year ("birth_year") and the
  # year the respondent turned 17 ("year17").
nlsy.79.year17 <- read.csv(file = "nlsy.79.birthyear.17.csv")

# Join both extracts onto the COLA-adjusted data, one after the other, by 'id'.
nlsy.79.ind.merged1 <- Reduce(
  function(left, right) merge(left, right, by = "id"),
  list(nlsy.79.ind.merged, nlsy.79.fips, nlsy.79.year17)
)

  #######################
  # County wages at 17  #
  #######################

# Load 1974 - 1981 county-level wages (from U.S. BEA). Variables are denoted as
# "cwage.ZZ" where ZZ denotes year (e.g., cwage.79 is county level wages in 1979).

    # NOTE: Some counties were missing (unavailable) from the BEA dataset. Therefore, within the CSV file, we
    # inserted blank rows (i.e., missing values) for fips codes without available county-level wage data. 
    # Specifically, county-level wages were unavailable for the following fips codes: 12025, 18080, 34077, 48484, 51165, 51191, 51600, 55136, 69150, 69153
    # Ultimately, respondents within these fips codes are dropped 

county.wages <- read.csv("county.wages.74.81.add.csv")

# Return `primary` where it holds a usable (positive, non-NA) code and
# `fallback` everywhere else.
first.valid.code <- function(primary, fallback) {
  ifelse(!is.na(primary) & primary > 0, primary, fallback)
}

# Fill in missing county codes:
#   fips81.rev : 1981 code, falling back to the raw 1980 code
#   fips80.rev : 1980 code, falling back to the 1979 code
#   fips14.rev : age-14 code, falling back to the 1979 code
nlsy.79.ind.merged1$fips81.rev <- first.valid.code(
  nlsy.79.ind.merged1$fips81, nlsy.79.ind.merged1$fips80
)
nlsy.79.ind.merged1$fips80.rev <- first.valid.code(
  nlsy.79.ind.merged1$fips80, nlsy.79.ind.merged1$fips79
)
nlsy.79.ind.merged1$fips14.rev <- first.valid.code(
  nlsy.79.ind.merged1$fips14, nlsy.79.ind.merged1$fips79
)

# The raw columns are no longer needed once the revised versions exist.
nlsy.79.ind.merged1$fips81 <- NULL
nlsy.79.ind.merged1$fips80 <- NULL
nlsy.79.ind.merged1$fips14 <- NULL

# Correct for a change in fips coding (former fips code 12025 is now 12086).
# The recode must target fips81.rev: the raw fips81 column has already been
# removed, and `$fips81` would only partially match fips81.rev on read while
# the assignment created a separate fips81 column, leaving fips81.rev unchanged.
for (fips.col in c("fips81.rev", "fips80.rev", "fips79", "fips14.rev")) {
  old.code.rows <- which(nlsy.79.ind.merged1[[fips.col]] == 12025)
  nlsy.79.ind.merged1[[fips.col]][old.code.rows] <- 12086
}

# Recode age 14 location to location in 79 if respondent was in an 'unspecific'
# place or unlisted (see Attachment 102 - State FIPS codes).
unspecific.rows <- which(
  nlsy.79.ind.merged1$fips14.rev %in% c(40400, 36500, 2818, 34061)
)
nlsy.79.ind.merged1$fips14.rev[unspecific.rows] <-
  nlsy.79.ind.merged1$fips79[unspecific.rows]

rm(first.valid.code, fips.col, old.code.rows, unspecific.rows)

# Create the county-level wage at age 17 (cwage17), corrected for inflation to
# 1983$, and its log (lcwage17).
#
# County used for the lookup, by the year the respondent turned 17:
#   1981 -> fips81.rev, 1980 -> fips80.rev, 1979 -> fips79,
#   1974-1978 -> fips14.rev (county of residence at age 14).
#
# The lookup uses match() on the wage table's fips column, so a county that is
# absent from the table gives NA instead of stopping the script. A year17
# outside 1974-1981 also gives NA rather than 0, so it cannot become
# log(0) = -Inf. Rows with NA are removed by the na.omit() call later on.

wage.years <- 1974:1981
# Year-specific multipliers that bring each year's wage to 1983$ (1974 ... 1981).
to.1983.dollars <- c(2.02, 1.83, 1.75, 1.64, 1.53, 1.37, 1.21, 1.10)

resp.year17 <- nlsy.79.ind.merged1$year17
resp.fips17 <- ifelse(
  resp.year17 == 1981, nlsy.79.ind.merged1$fips81.rev,
  ifelse(
    resp.year17 == 1980, nlsy.79.ind.merged1$fips80.rev,
    ifelse(
      resp.year17 == 1979, nlsy.79.ind.merged1$fips79,
      nlsy.79.ind.merged1$fips14.rev
    )
  )
)

# Row of county.wages holding each respondent's county (NA when there is none).
wage.row <- match(resp.fips17, county.wages$fips, incomparables = NA)

cwage17 <- rep(NA_real_, nrow(nlsy.79.ind.merged1))
for (k in seq_along(wage.years)) {
  turned17 <- which(resp.year17 == wage.years[k])
  wage.col <- paste0("cwage.", wage.years[k] - 1900)   # e.g. "cwage.79"
  cwage17[turned17] <- county.wages[[wage.col]][wage.row[turned17]] * to.1983.dollars[k]
}

nlsy.79.ind.merged1$cwage17 <- cwage17
nlsy.79.ind.merged1$lcwage17 <- log(cwage17)

rm(wage.years, to.1983.dollars, resp.year17, resp.fips17, wage.row)
rm(cwage17, k, turned17, wage.col)
  

  #######################
  # State unemp at 17   #
  #######################

# Load state-level unemployment data (1976-1981) from BLS. Variables denoted as
# "unempZZ" where ZZ denotes year (e.g., unemp79 is state-level unemployment in 1979).

state.unemp <- read.csv("state.unemp.76.81.csv")

# Return `primary` where it holds a usable (positive, non-NA) code and
# `fallback` everywhere else.
first.valid.code <- function(primary, fallback) {
  ifelse(!is.na(primary) & primary > 0, primary, fallback)
}

# Correct missing 1981 state values -- assign the 1980 value.
nlsy.79.ind.merged1$state81.rev <- first.valid.code(
  nlsy.79.ind.merged1$state81, nlsy.79.ind.merged1$state80
)

# Correct missing 1980 state values -- assign the 1979 value.
nlsy.79.ind.merged1$state80.rev <- first.valid.code(
  nlsy.79.ind.merged1$state80, nlsy.79.ind.merged1$state79.y
)

# Correct missing age-14 state values -- assign the 1979 value. State code 69
# at age 14 is also replaced by the 1979 state.
# NOTE(review): the reason code 69 is treated as unusable is not stated here --
# confirm against the state code list.
state14.usable <- !is.na(nlsy.79.ind.merged1$state14) &
  nlsy.79.ind.merged1$state14 > 0 &
  nlsy.79.ind.merged1$state14 != 69
nlsy.79.ind.merged1$state14.rev <- ifelse(
  state14.usable, nlsy.79.ind.merged1$state14, nlsy.79.ind.merged1$state79.y
)

# The raw columns are no longer needed once the revised versions exist.
nlsy.79.ind.merged1$state81 <- NULL
nlsy.79.ind.merged1$state80 <- NULL
nlsy.79.ind.merged1$state14 <- NULL

rm(first.valid.code, state14.usable)

# Create the state-level unemployment variable at age 17 (sunemp17).
#
# State used for the lookup, by the year the respondent turned 17:
#   1981 -> state81.rev, 1980 -> state80.rev, 1979 -> state79.y,
#   1974-1978 -> state14.rev (state of residence at age 14).
# Unemployment column used: unempZZ for 1976-1981. Respondents who turned 17 in
# 1974 or 1975 are given the 1976 value (unemp76), presumably because the
# loaded series starts in 1976.
#
# The lookup uses match() on the sfips column, so a state that is absent from
# the unemployment table gives NA instead of stopping the script. A year17
# outside 1974-1981 also gives NA rather than a silent 0. Rows with NA are
# removed by the na.omit() call later on.

resp.year17 <- nlsy.79.ind.merged1$year17
resp.state17 <- ifelse(
  resp.year17 == 1981, nlsy.79.ind.merged1$state81.rev,
  ifelse(
    resp.year17 == 1980, nlsy.79.ind.merged1$state80.rev,
    ifelse(
      resp.year17 == 1979, nlsy.79.ind.merged1$state79.y,
      nlsy.79.ind.merged1$state14.rev
    )
  )
)

# Row of state.unemp holding each respondent's state (NA when there is none).
unemp.row <- match(resp.state17, state.unemp$sfips, incomparables = NA)

sunemp17 <- rep(NA_real_, nrow(nlsy.79.ind.merged1))
for (yr in 1974:1981) {
  turned17 <- which(resp.year17 == yr)
  unemp.col <- paste0("unemp", max(yr, 1976) - 1900)   # 1974/1975 reuse "unemp76"
  sunemp17[turned17] <- state.unemp[[unemp.col]][unemp.row[turned17]]
}

nlsy.79.ind.merged1$sunemp17 <- sunemp17

rm(resp.year17, resp.state17, unemp.row, sunemp17, yr, turned17, unemp.col)


  #############################
  #  College variables at 14  #
  #############################

# Read the college variables at age 14 from Kling (2001), based on HEGIS data.
# The file provides:
  # "sfips" = state code
  # "cfips" = county code
  # "fips" = fips code
  # "pub2" = presence of 2 year public college in county of residence at 14
  # "pub4" = presence of 4 year public college in county of residence at 14
  # "min.tuit.pub" = minimum tuition at public college (2 or 4 year) in county of residence at 14 (0 if no college in county)
  # "min.tuit.pub4" = minimum tuition at public 4 year college in county of residence at 14 (0 if no college in county)

college.kling <- read.csv(file = "college.data.kling.clean.csv")

  # Per state, take the smallest strictly positive county-level tuition: first
  # for public 4 year colleges (min.tuit4s), then for any public college, 2 or
  # 4 year (min.tuits).
  # NOTE(review): a state with no positive tuition value gets min() of an empty
  # set, i.e. Inf with a warning, which na.omit() does not remove -- confirm
  # that no such state is present.

state.codes <- sort(unique(c(college.kling$sfips)))

lowest.positive.by.state <- function(tuition) {
  vapply(
    state.codes,
    function(code) min(tuition[college.kling$sfips == code & tuition > 0]),
    numeric(1)
  )
}

state.tuition <- data.frame(
  sfips = state.codes,
  min.tuit4s = lowest.positive.by.state(college.kling$min.tuit.pub4),
  min.tuits = lowest.positive.by.state(college.kling$min.tuit.pub)
)

rm(lowest.positive.by.state, state.codes)

  # Attach the state-level minimum tuition through the age-14 state, then the
  # county-level college variables through the age-14 county. Respondents with
  # no match are kept (all.x = TRUE).
nlsy.79.ind.merged1 <- merge(
  nlsy.79.ind.merged1, state.tuition,
  by.x = "state14.rev", by.y = "sfips", all.x = TRUE
)

college.cols <- names(college.kling)
county.college <- college.kling[
  , match("fips", college.cols):match("min.tuit.pub4", college.cols),
  drop = FALSE
]
nlsy.79.ind.merged1 <- merge(
  nlsy.79.ind.merged1, county.college,
  by.x = "fips14.rev", by.y = "fips", all.x = TRUE
)
rm(college.cols, county.college)

  # The county-level tuition variables are not kept; only the state-level
  # minimums are.
for (tuition.var in c("min.tuit.pub", "min.tuit.pub4")) {
  nlsy.79.ind.merged1[[tuition.var]] <- NULL
}
rm(tuition.var)


# Drop the helper variables that are not used in the analysis (a name that is
# not present in the data is simply skipped).
unused.vars <- c(
  "fips14.rev", "fips80.rev", "fips81.rev", "fips79", "fips81", "fips82",
  "state14.rev", "state79.x", "state79.y", "state80.rev", "state81.rev",
  "state82", "state97", "birth_year", "year17"
)
for (unused.var in unused.vars) {
  nlsy.79.ind.merged1[[unused.var]] <- NULL
}
rm(unused.vars, unused.var)

# Remove the intermediate data sets from the workspace; only the merged data
# remains.
rm(list = c(
  "college.kling", "county.wages", "nlsy.79.fips", "nlsy.79.ind.merged",
  "nlsy.79.initial", "nlsy.79.states", "nlsy.79.year17", "nlsy.cola",
  "state.tuition", "state.unemp"
))

# Remove observations with missing IV data (sample size reduced from 1339 to 1321)
nlsy.79.ind.merged1 <- na.omit(nlsy.79.ind.merged1)

# Add the natural log of the COLA-adjusted respondent income (lcincome) and of
# the COLA-adjusted parental income (parent.lcincome), in that column order.
nlsy.79.ind.merged1$lcincome <- log(nlsy.79.ind.merged1$cincome)
nlsy.79.ind.merged1$parent.lcincome <- log(nlsy.79.ind.merged1$parent.cincome)

# Write the final merged data set to disk.

write.csv(nlsy.79.ind.merged1, file = "nlsy.79.ind.merged1.csv")








