/********************************************************************************
	 
	 Revisiting Gender Identity and Relative Income within Households –
	A cautionary tale on the potential pitfalls of density estimators.

		Daniel Kuehnle, Michael Oberfichtner, Kerstin Ostermann
						  Last update: March 2021
								
						Dofile for the US analyses
						- preparation file for SSB

	Run this file on the Synthetic Data Server after adjusting PATH NAME
	+++++CAUTION: THIS FILE WILL NOT RUN ON THE GOLD STANDARD FILES+++++
						
*******************************************************************************/

* Start from an empty session and pin the Stata version used for this do-file
clear all
version 14.2

* Make the DCdensity ado-file available (adjust the path before running)
adopath + "PATH NAME"

* Set the working directory and open a fresh plain-text log
cd "PATH NAME"
capture log close
log using KOO_SSBprepration, replace text

* Build one couple-level file per synthetic data set.
* Input : ssb_v6_0_2_synthetic<subv>_<data>.dta  (subv = 1..4, data = 1..4)
* Output: KOO_SIPP<subv><data>.dta with one observation per couple, the
*         female income share (femshare) and the markers subversion and data.

* First-year panels used in the analysis (1984 and 2008 are dropped below)
local panelyears 1990 1991 1992 1993 1996 2001 2004

forvalues subv = 1/4 {
	forvalues data = 1/4 {
		// load data set
		cd "PATH NAME"
		use ssb_v6_0_2_synthetic`subv'_`data'.dta, clear

		cd "PATH NAME"

		* BKP: "we use the observation from the first year that the couple is
		* in the panel which ranges from 1990 to 2004" (p. 575)
		drop if inlist(panel, 1984, 2008)
		tabulate panel

		// restrict to individuals aged between 18 and 65
		generate age = panel - year(birthdate)
		keep if inrange(age, 18, 65)

		// identify couples
		sort personid spouse_personid
		drop if missing(personid) | missing(spouse_personid)

		* individuals
		* ids are stored as double: the default float type is exact only up
		* to 2^24 and would silently merge larger ids
		generate double wife_id = personid if male == 0
		replace wife_id = spouse_personid if male == 1
		generate double husb_id = personid if male == 1
		replace husb_id = spouse_personid if male == 0

		* one household
		* An explicit format keeps long ids out of scientific notation, and
		* the separator makes the key unambiguous: without it husband 12 with
		* wife 345 and husband 123 with wife 45 would both map to "12345".
		generate swife_id = string(wife_id, "%20.0f")
		generate shusb_id = string(husb_id, "%20.0f")
		generate hhid = shusb_id + "_" + swife_id
		* NOTE(review): tag marks one spouse per couple; age, male and mh_date1
		* in the saved file belong to whichever spouse is tagged - confirm that
		* downstream code does not rely on a specific spouse
		egen tag = tag(hhid)

		// Income
		* compute income for each individual in first year of each panel
		foreach yr of local panelyears {
			generate inc_`yr' = total_der_fica_`yr' + total_der_nonfica_`yr' if panel == `yr'
		}

		keep hhid swife_id shusb_id tag inc_* panel mh_date1 age male

		* create income measure that is not year specific
		generate inc = .
		foreach yr of local panelyears {
			replace inc = inc_`yr' if panel == `yr'
		}

		* compute income measures for wife and husband and spread them over
		* both observations of the couple
		generate rwife_inc = inc if male == 0
		generate rhusb_inc = inc if male == 1
		bysort hhid: egen m_wife_inc = mean(rwife_inc)
		bysort hhid: egen m_husb_inc = mean(rhusb_inc)

		* relative income
		generate femshare = m_wife_inc / (m_wife_inc + m_husb_inc)

		// one observation per couple
		keep if tag == 1
		drop inc_*

		* Bertrand: "The sample includes all married couples where both the
		* husband and wife earn positive income (...)" (Appendix, p.25)
		* A share of 0 or 1 means one spouse has zero income; a missing share
		* means a spouse is absent from the sample or both incomes are zero.
		drop if missing(femshare) | inlist(femshare, 0, 1)

		generate subversion = `subv'
		generate data = `data'
		save KOO_SIPP`subv'`data'.dta, replace
	}
}

// Join data sets
* Stage 1: for each value of the first file-name index, stack the four files
* sharing that index and store the result as KOO_SIPP<index>.dta
forvalues first = 1/4 {
	use KOO_SIPP`first'1.dta, clear
	forvalues second = 2/4 {
		append using KOO_SIPP`first'`second'.dta
	}
	save KOO_SIPP`first'.dta, replace
}

* Stage 2: the stacked file for index 4 is still in memory after the loop
* above, so only the stacked files for indices 1 to 3 need to be added
forvalues first = 1/3 {
	append using KOO_SIPP`first'.dta
}

// Identify different datasets
* identify runs from 1 to 16: it equals subversion for data 1 and is shifted
* by 4, 8 and 12 for data 2, 3 and 4. It stays missing when subversion is not
* one of 1 to 4, and gets no shift when data is not one of 2 to 4.
capture drop identify
generate identify = subversion + cond(inlist(data, 2, 3, 4), 4 * (data - 1), 0) ///
	if inlist(subversion, 1, 2, 3, 4)

label variable identify "Identifier of synthetic dataset versions"
drop subversion data

* Store the pooled file and close the log
save KOO_SIPP.dta, replace
log close
