********************************************************************************
*                             ReadAHSData.do                                   *
********************************************************************************

* 1. Unzip Flat Files
*
* Extract each AHS flat-file archive found under RawData/ into the current
* working directory. Extraction is deliberately best-effort (capture): a
* failure for one archive does not stop the script. Instead of swallowing the
* failure silently, a note with the return code is printed so that a missing
* or misnamed archive is visible in the log before the CSV import step runs.

foreach archive in ///
    "AHS 1999 National PUF v1.1 Flat CSV" ///
    "AHS 2001 National PUF v1.1 Flat CSV" ///
    "AHS 2003 National PUF v1.1 Flat CSV" ///
    "AHS 2005 National PUF v1.1 Flat CSV" ///
    "AHS 2007 National PUF v1.1a Flat CSV" ///
    "AHS 2009 National PUF v1.1 Flat CSV" ///
    "AHS 2011 National PUF v2.0 Flat CSV" ///
    "AHS 2002 Metropolitan PUF v1.1 Flat CSV" ///
    "AHS 2004 Metropolitan PUF v1.1 Flat CSV" ///
    "AHS 2007 Metropolitan PUF v1.1 Flat CSV" ///
    "AHS 2009 Metropolitan PUF v1.1 Flat CSV" ///
    "AHS 2011 Metropolitan PUF v2.0 Flat CSV" {

    capture unzipfile "RawData/`archive'.zip"
    * keep the return code before any other command can reset it
    local rc = _rc
    if `rc' != 0 {
        display as text "note: unzipfile did not extract RawData/`archive'.zip (rc = `rc'); continuing"
    }
}


* Stems of the raw CSV files, one per survey file. The code below reads the
* last character of each stem as the sample letter (n/m, either case) and the
* four characters before it as the interview year.
local filelist "tAHS1999N  tAHS2001N  tAHS2003N tAHS2005N  tAHS2007n  tAHS2009N  ahs2011n  tAHS2002M  tAHS2004m tAHS2007m  tAHS2009m  ahs2011m "

* For every raw file: import, harmonise variable names, keep the analysis
* variables and save the result as <stem>extract.dta.
foreach stem of local filelist {

    * log the wall-clock time of each import and start from an empty dataset
    display c(current_time)
    clear
    import delimited using "`stem'.csv"

    * School variables disappear in some files: add empty placeholders
    * (capture makes this a no-op when the variable is already present)
    foreach placeholder in schoth schpri schpub {
        capture generate `placeholder' = .
    }

    * interview year, taken from the file stem
    generate intyr = substr("`stem'", -5, 4)
    destring intyr, replace

    * sample indicator, taken from the last character of the file stem
    local tag = lower(substr("`stem'", -1, 1))
    if "`tag'" == "n" {
        local tag "NationalSample"
    }
    else if "`tag'" == "m" {
        local tag "MetropolitanSample"
    }
    generate sampleflag = "`tag'"

    * Region derived from the numeric state code. Every step is wrapped in
    * capture so that files without a usable state variable (the original note
    * mentions the 2011 metropolitan sample) simply leave region_gen missing.
    generate region_gen = .
    capture destring state, replace ignore(')
    capture replace region_gen = 1 if inlist(state, 23, 33, 50, 25, 44, 9, 36, 34, 42)
    capture replace region_gen = 2 if inlist(state, 39, 18, 17, 26, 55, 27, 19, 29, 38, 46, 31, 20)
    capture replace region_gen = 3 if inlist(state, 10, 24, 11, 51, 54, 37, 45, 13, 12, 21, 47, 1, 28, 5, 22, 40, 48)
    capture replace region_gen = 4 if inlist(state, 30, 16, 56, 8, 35, 4, 49, 32, 53, 41, 6, 2, 15)

    * region_gen only becomes region when the file has no region of its own
    capture generate region = region_gen

    * 2011 has no rmov variable; degree is likewise created empty when absent
    capture generate rmov = .
    capture generate degree = .

    * Harmonise the metro indicator name: metro3 first, then metro
    * (metropolitan samples). The two may not be comparable between the
    * metropolitan and national samples.
    capture rename metro3 METRO3
    capture rename metro METRO3

    * Empty cmsa / smsa for files in which they are not recorded; this mostly
    * applies to metropolitan sample data where SMSA is reported but not CMSA
    capture generate cmsa = .
    capture generate smsa = .

    destring smsa, replace ignore(')

    * restrict to the variables used in the analysis
    keep control ownhere ran rac rmov* value ///
        schoth schpri schpub ///
        *movm* *move* tenure pmovyr *rec* ///
        weight zinc per zadult ///
        METRO3 region built smsa cmsa pline* ///
        degree frstho intyr sampleflag

    ****************************************************************************
    * ADJUST FOR INCONSISTENCY BETWEEN FLAT AND RELATIONAL FILES IN TERMS OF
    * PERSON LINE NUMBERS -- IMPORTANT STEP
    * This step is essential for replicating Ngai and Tenreyro exactly; the
    * general results do go through otherwise.
    * For each of move / movm / rmov, take the value from person slot k
    * (k = 1..5) whenever pline<k> equals 1; later slots overwrite earlier ones.
    foreach base in move movm rmov {
        capture generate `base' = .
        forvalues k = 1/5 {
            capture replace `base' = `base'`k' if pline`k' == 1
        }
    }
    ****************************************************************************

    * per-person columns are no longer needed
    * (the ranges follow the dataset's variable order)
    drop move1-move16 movm1-movm16

    * log transforms, applied in place
    replace zinc = ln(zinc)
    replace built = ln(built)

    * destring data for more compact storage and numerical operations
    foreach v of varlist _all {
        capture destring `v', replace ignore(')
    }

    save "`stem'extract", replace
}


* Stack the per-file extracts (in filelist order) into one analysis dataset.
clear all

foreach stem of local filelist {
    append using "`stem'extract"
}

save "AHSForAnalysis", replace

********************************************************************************
* Create Crosswalk from SMSA to CMSA to use in cases where CMSA is not recorded
*
* Input : tAHS1999Nextract.dta (the 1999 national extract)
* Output: MSACrosswalk.dta with one row per smsa and the variable cmsa_impute
*
* Only cmsa and smsa are needed. The previous version also computed a
* region_impute variable that collapse discarded immediately, so that dead
* step (and the region variable kept only to feed it) has been removed.
* clear lets this section be re-run with unsaved data in memory.
use tAHS1999Nextract.dta, clear
keep cmsa smsa
/*
use mode of cmsa by smsa
   other approaches are possible, such as min
   ... the min approach avoids assigning some smsa values to 99
   NOTE(review): egen mode() is called without minmode/maxmode; confirm how
   ties between several modal cmsa values should be resolved.
*/

egen cmsa_impute = mode(cmsa), by(smsa)

* cmsa_impute is constant within smsa, so the mean just reduces the data to
* one row per smsa
collapse (mean) cmsa_impute , by(smsa)
label var cmsa_impute "Imputed CMSA from SMSA using 1999 data"
save MSACrosswalk , replace

* Attach the SMSA -> CMSA crosswalk to the stacked data and fill in cmsa.
use "AHSForAnalysis"
merge m:1 smsa using "MSACrosswalk"

* cmsa_flag marks rows whose cmsa was missing and for which the crosswalk
* supplies a value
generate cmsa_flag = (cmsa == . & cmsa_impute != .)

* use the imputed value for flagged rows and for the four SMSAs that receive
* a manual override below
replace cmsa = cmsa_impute if cmsa_flag == 1 | inlist(smsa, 1280, 1520, 6440, 7280)

* Manual overrides, written as "smsa cmsa" pairs:
*   1280 -> 10   Buffalo, NY
*   1520 -> 99   Charlotte, NC ... no corresponding CMSA code in the public use files
*   6440 -> 79   Portland, OR
*   7280 -> 49   Riverside, CA
foreach pair in "1280 10" "1520 99" "6440 79" "7280 49" {
    gettoken msa code : pair
    replace cmsa = `code' if smsa == `msa'
}

save "AHSForAnalysis", replace
