* This do-file merges the relevant datasets within each wave, keeps the relevant variables and then combines all waves
* Start from an empty session
clear all

*PRFITB: gross household income
* Variable lists to keep from every wave (names are given WITHOUT the wave prefix):
*   VARSi  - individual-level variables
*   VARShh - household-level variables
* Not every variable exists in every wave; the wave loop keeps whatever is present.
* The education variables edtype school scend sctype scnow are intentionally left out.
* hid and nchild appear twice in VARSi; the repetition is harmless.
global VARSi jbhrs jbot hid futrk futrl bwtag* doiy4 doby pno hid hgr2r plbornc ///
	hsownd hscost race yr2uk4 nmar nchild lprnt lnprnt fiyr* xpfood ncars carown ///
	isced fenow feend lchmor lchmorn lchnmor pay* bankk* debt* nvest* save* svack* sex ///
	mlstat jbstat age nchild
global VARShh doim doiy4 nkids nch02 nch34 nch511 nch1215 nch1618 birhh mabwly mabwnly ///
	bwtagm1 bwtagm2 bwtagm3


* Build the stacked panel. For every wave (prefix a = wave 1, ..., r = wave 18):
*   1. load the individual file and sort it on the household id,
*   2. load the household file and match the individuals onto it,
*   3. strip the wave prefix from all variable names and add a wave counter,
*   4. keep only the variables listed in the VARSi / VARShh globals,
*   5. append the waves processed so far.
local wave = 0
foreach PRE in a b c d e f g h i j k l m n o p q r {
	local wave = `wave' + 1
	display `wave'

	* Individual respondents (documented option of -use- is clear, not replace)
	use ${bhps_data}\\`PRE'indresp, clear
	destring, replace

	**********************************
	* Merge with household variables *
	* Sort by household id (hid)
	sort `PRE'hid
	tempfile Individual
	save `Individual'
	use ${bhps_data}\\`PRE'hhresp, clear
	sort `PRE'hid
	merge `PRE'hid using `Individual'

	* Remove prefix and add wave variable
	renpfix `PRE'
	gen wave = `wave'
	* In wave 16 the prefix is p, so stripping it also turns pid into id: rename it back
	if `wave'==16 {
		ren id pid
	}

	***************************************
	* To save RAM: only keep some variables.
	* Some variables only exist in some waves, so every entry is expanded with -unab-
	* under -capture-; entries that match nothing in this wave are silently skipped.
	loc VarsToKeep
	foreach v in pid wave ${VARSi} ${VARShh} {
		loc j
		cap unab j: `v'
		loc VarsToKeep  `VarsToKeep' `j'
	}
	keep `VarsToKeep'
	***************************************

	* Append with previous data: Investigate what happens when different types (int vs. str4).
	* The force-option produces missings..
	if `wave'>1 {
		append using `AllWaves', force
	}

	* A fresh temporary file holds everything accumulated up to and including this wave
	tempfile AllWaves
	save `AllWaves'
* End of year loop
}

sort pid wave
drop if missing(pid)

label var birhh "baby born since last wave"


*************************************************************************************
* Use bracketed wealth information.
*   IMP<asset>  : reported amount, else the lower bound of the reported bracket
*   IMP2<asset> : reported amount, else the midpoint of the reported bracket
*                 (an arbitrary amount is added for the open-ended top bracket)
*************************************************************************************

* Negative codes (inapplicable etc.) are recoded to missing
foreach w of varlist nkids save* bankk* nvest* debt* svack* {
	replace `w' = . if `w'<0
}

* The bracket questions are ordered 1000, 5000, 10000, 500, so the last question is
* renumbered to 0 and the questions can be walked in increasing order.
foreach stub in bankkb savekb nvestc debtc svackb {
	cap ren `stub'4 `stub'0
}
* For some variables there are five groups in 2005: the fifth takes the freed number 4
foreach stub in nvestc debtc svackb {
	cap ren `stub'5 `stub'4
}

* Per-asset settings (the asset name is the variable holding the reported amount):
*   q_X   : stub of the yes/no bracket questions
*   br_X  : name of the derived bracket indicator
*   lb_X  : lower bounds for bracket -1, 0, 1, 2, ... in that order
*   top_X : amount added to the last lower bound for the open-ended top bracket
local assets bankk nvestk savek debty svack

local q_bankk    bankkb
local br_bankk   Bracket
local lb_bankk   250 500 1000 5000 10000
local top_bankk  2500

* nvestk: ALL investments, both sole and joint
local q_nvestk   nvestc
local br_nvestk  BracketNv
local lb_nvestk  500 1000 5000 15000 50000 100000
local top_nvestk 25000

local q_savek    savekb
local br_savek   BracketSa
local lb_savek   250 500 1000 5000 10000
local top_savek  2500

local q_debty    debtc
local br_debty   BracketDe
local lb_debty   50 100 500 1500 5000 10000
local top_debty  2500

local q_svack    svackb
local br_svack   BracketSv
local lb_svack   250 500 1000 5000 10000 20000
local top_svack  5000

* Create the (empty) bracket indicators
foreach a of local assets {
	cap drop `br_`a''
	g `br_`a'' = .
}

* Highest bracket answered with yes (==1); a no (==2) on bracket 0 gives bracket -1
foreach a of local assets {
	local B `br_`a''
	local q `q_`a''
	local n : word count `lb_`a''
	local K = `n' - 2
	forvalues br = 0/`K' {
		replace `B' = `br' if `q'`br'==1 & !missing(`q'`br')
	}
	replace `B' = -1 if `q'0==2 & !missing(`q'0)
}

* Construct the imputed measures
* First: use the lower bound of the bracket
foreach a of local assets {
	local B `br_`a''
	cap drop IMP`a'
	g IMP`a' = `a'
	local k = -1
	foreach amt of local lb_`a' {
		replace IMP`a' = `amt' if `B'==`k' & missing(`a')
		local k = `k' + 1
	}
	replace IMP`a' = . if missing(`a') & missing(`B')
}

* Second: use the midpoint between brackets (and an arbitrary point for the top bracket)
foreach a of local assets {
	local B `br_`a''
	local n : word count `lb_`a''
	cap drop IMP2`a'
	g IMP2`a' = `a'
	forvalues i = 1/`n' {
		local lo : word `i' of `lb_`a''
		if `i'==1 {
			* bracket -1: same amount as in the lower-bound version
			local amt = `lo'
		}
		else if `i'==`n' {
			* open-ended top bracket
			local amt = `lo' + `top_`a''
		}
		else {
			local j = `i' + 1
			local hi : word `j' of `lb_`a''
			local amt = (`lo' + `hi')/2
		}
		* position i in the list corresponds to bracket number i-2
		replace IMP2`a' = `amt' if `B'==`i'-2 & missing(`a')
	}
	replace IMP2`a' = . if missing(`a') & missing(`B')
}

* Delete variables used to impute wealth-information
drop  nvest* debt* savek* bankk* Bracket* svack*

*************************************************************************************

*save BHPSpanel, replace
sort pid wave
cap drop _merge
tempfile Panel
save `Panel'

*************************************************************************************
* Merge children birthyear onto this data
* The child history files of waves b, k and l are stacked. For every respondent (pid)
* the number of recorded children with a non-missing birth-year entry (BioKids) and
* the number of records with lchsx==2 (NoGirls) are computed per source wave.
*clear all
*BHPS\UKDA-5151-stata8\stata8\\
local wave_nr = 1
foreach wave in b k l {
	use ${bhps_data}\\`wave'childnt, clear
	keep `wave'hid `wave'lchby4 `wave'lncno `wave'lchlv `wave'lchsx pid

	keep if `wave'lchlv~=4  /*child not still born*/
	sort pid

	by pid: egen `wave'BioKids=sum(!missing(`wave'lchby4))
	by pid: egen `wave'NoGirls=sum((`wave'lchsx==2))

	* Stripping the prefix also strips it from the two counters just created, so
	* wave-specific copies are generated again under the prefixed names.
	renpfix `wave'
	gen `wave'BioKids=BioKids
	gen `wave'NoGirls=NoGirls

	if `wave_nr'>1 {
		append using `ChildHist', force
	}

	tempfile ChildHist
	save `ChildHist'
	local wave_nr = `wave_nr'+ 1
}

*sort pid hid lncno
*duplicates report pid  lncno, force
*duplicates drop pid lncno, force

**************************************
* Reshape to get one observation per individual - and a variable for each child
* Full variable names are used throughout so that nothing depends on variable-name
* abbreviation (the names created by reshape must match the ranges dropped below).
keep pid lncno lchby4 bBioKids kBioKids lBioKids bNoGirls kNoGirls lNoGirls

ren lchby4 BirthYear
replace BirthYear = . if BirthYear<0
reshape wide BirthYear bBioKids kBioKids lBioKids bNoGirls kNoGirls lNoGirls, i(pid) j(lncno)
aorder
* Keep the birth year of children 1-7; the counters are constant within pid and source
* wave, so only the copy attached to child number 1 is kept.
drop BirthYear8-BirthYear15 bBioKids2-bBioKids15 kBioKids2-kBioKids15 lBioKids2-lBioKids15 bNoGirls2-bNoGirls15 kNoGirls2-kNoGirls15 lNoGirls2-lNoGirls15
ren bBioKids1 bBioKids
ren kBioKids1 kBioKids
ren lBioKids1 lBioKids

ren bNoGirls1 bNoGirls
ren kNoGirls1 kNoGirls
ren lNoGirls1 lNoGirls

/*
replace bBioKids=0 if missing(bBioKids)
replace kBioKids=0 if missing(kBioKids)
replace lBioKids=0 if missing(lBioKids)
*/

label var bBioKids "No Bio children in wave b: 1992"
label var kBioKids "No Bio children in wave k: 2001"
label var lBioKids "No Bio children in wave l: 2002"

sort pid
**************************************
* Merge this data onto the panel data
merge pid using `Panel'
sort pid wave
drop _merge
tempfile Panel
save `Panel'

* Merge with the age of children recorded in the wave q and wave m child files.
* Within each pid the records are numbered by descending age (kid = 1 is the largest
* age value) and reshaped to one row per pid with qAge1, qAge2, ... / mAge1, mAge2, ...
foreach w in q m {
	if "`w'"=="q" {
		local yr 2007
	}
	else {
		local yr 2003
	}

	use ${bhps_data}\\`w'child, clear
	keep pid `w'scage
	ren `w'scage `w'Age
	label var `w'Age "Age of child in wave `w': `yr'"

	* sort on minus age so that the largest age comes first within pid
	g temp =-`w'Age
	sort pid temp
	drop temp
	by pid: gen kid =_n
	reshape wide `w'Age , i(pid) j(kid)

	sort pid
	merge pid using `Panel'
	drop _merge

	if "`w'"=="q" {
		sort pid wave
	}
	else {
		* after the last merge the panel is left sorted on hid, the key of the
		* hid-based merge that follows in this file
		sort hid
	}
	tempfile Panel
	save `Panel'
}
/* Constructing a dataset with all new entrants due to new babies */

local wave = 0
foreach PRE in b c d e f g h i j k l m n o p q r {
	local wave = `wave' + 1

	use ${bhps_data}\\`PRE'indall, clear
	destring, replace

	**********************************
	* Keep the records flagged newhy==1 and take neyrjn as the baby's birth year
	keep if `PRE'newhy==1
	gen BYBaby=`PRE'neyrjn
	* In the first five files of this loop a two-digit year is converted to four digits.
	* This must be an -if- qualifier: the -if- command would test the first observation
	* only and then change either every row or none. Missing values compare as larger
	* than any number, hence the explicit missing check; the upper bound keeps values
	* that are already four-digit years untouched.
	replace BYBaby=`PRE'neyrjn+1900 if `wave'<6 & !missing(`PRE'neyrjn) & `PRE'neyrjn>90 & `PRE'neyrjn<100

	renpfix `PRE'
	* The loop starts at prefix b, so the panel wave number is the loop counter plus one
	gen wave = `wave'+1
	* Prefix p (loop counter 15): stripping the prefix turned pid into id
	if `wave'==15 {
		ren id pid
	}

	keep wave hid pid BYBaby
	if `wave'>1 {
		append using `AllWBaby', force
	}

	tempfile AllWBaby
	save `AllWBaby'

* End of year loop
}
sort hid BYBaby
duplicates report hid  , force
gen nn=1
* noBaby: number of baby records in the household; nnBaby: running number within hid
bys hid: egen noBaby=sum(nn)
bys hid: gen nnBaby=sum(nn)

drop pid
reshape wide BYBaby noBaby, i(hid) j(nnBaby)  /* dealing with multiple births within same wave */
* noBaby is constant within hid, so only the first copy is kept
ren noBaby1 noBaby
drop noBaby2 noBaby3
duplicates report hid, force

sort hid

merge hid using `Panel'

sort pid wave
xtset pid wave
/* number of biological children */
/* Number of newborn biological babies (since last wave) */
* Only defined for wave>9; BioBaby is missing in earlier waves
gen BioBaby=0 if wave>9
replace BioBaby=mabwnly if birhh==1 & mabwly==1 & wave>9
label var BioBaby "No of biological babyies born since last wave"

/* birth year of newborn */
* Interview month after the birth month: born in the interview year, otherwise the
* year before. Missing values compare as larger than any number, so without explicit
* guards a missing BioBaby (all waves up to 9) passes BioBaby~=0 and a missing bwtagm1
* passes doim<=bwtagm1, which would assign a birth year to people without a newborn.
gen BYBabyII=doiy4 if doim>bwtagm1 & BioBaby~=0 & !missing(BioBaby) & !missing(doim) & !missing(bwtagm1)
replace BYBabyII=doiy4-1 if doim<=bwtagm1 & BioBaby~=0 & !missing(BioBaby) & !missing(doim) & !missing(bwtagm1)
label var BYBabyII "Alt. measure of YOB for new born"
/* fill in values of new borns if it is first wave */
bys pid: egen Firstwave=min(wave)
label var Firstwave "First wave in survey"
replace noBaby=BioBaby if missing(noBaby) & wave==Firstwave
replace BYBaby1=BYBabyII if missing(BYBaby1) & wave==Firstwave

replace noBaby=0 if missing(noBaby)
label var noBaby "Number of new born babies since last wave"
label var BYBaby1 "Year of birth for new born baby since last wave"
label var BYBaby2 "Year of birth for new born baby since last wave"
label var BYBaby3 "Year of birth for new born baby since last wave"

* Running number of newborns from wave 3 onwards; the wave ordering within pid is
* stated explicitly because a running sum depends on the row order.
bys pid (wave): gen SumBaby=sum(noBaby) if wave>2

replace SumBaby=0 if wave==1 |wave==2
gen fBioBaby=f.BioBaby
gen f2BioBaby=f2.BioBaby

* --- Histories from wave b (panel wave 2) ---
gen NChildren=bBioKids-f.noBaby if wave==1

replace NChildren=bBioKids if wave==2 & !missing(bBioKids)
replace NChildren=0 if wave==2 & missing(bBioKids)
replace NChildren=bBioKids+SumBaby if wave>2 & Firstwave<3 & !missing(bBioKids)
replace NChildren=0+SumBaby if wave>2 & Firstwave<3 & missing(bBioKids)

* --- Histories from wave k (panel wave 11): waves 9 and 10 are backed out by
*     subtracting the births reported in the following waves ---
replace NChildren=kBioKids-fBioBaby-f2BioBaby if wave==9 &  !missing(kBioKids)
replace NChildren=. if wave==9 &  !missing(kBioKids) & NChildren==-1

replace NChildren=kBioKids-fBioBaby if wave==10 &  !missing(kBioKids)
replace NChildren=kBioKids if wave==11 &  !missing(kBioKids)

replace NChildren=0 if wave==9 & (Firstwave==9) & missing(kBioKids)& !missing(f2.wave)
replace NChildren=0 if wave==10 & (Firstwave==9 | Firstwave==10) & missing(kBioKids) & !missing(f.wave)
replace NChildren=0 if wave==11 & (Firstwave==9 | Firstwave==10) & missing(kBioKids)

replace NChildren=kBioKids+SumBaby if wave>11 &  !missing(kBioKids)
replace NChildren=0+SumBaby if wave>11 & (Firstwave==9 | Firstwave==10) & missing(kBioKids)

* --- Histories from wave l (panel wave 12); these statements come last and therefore
*     take precedence over the wave-k values where both are available ---
replace NChildren=lBioKids-fBioBaby if wave==11 & !missing(lBioKids)
replace NChildren=lBioKids if wave==12 & !missing(lBioKids)
replace NChildren=0 if wave==11 & (Firstwave==11)  & missing(lBioKids) & missing(kBioKids) & !missing(f.wave)
replace NChildren=0 if wave==12 & (Firstwave==11 | Firstwave==12)  & missing(lBioKids) & missing(kBioKids)
replace NChildren=lBioKids+SumBaby if wave>12 & !missing(lBioKids)
replace NChildren=0+SumBaby if wave>11 & (Firstwave==11 | Firstwave==12) & missing(lBioKids) & missing(kBioKids)

label var NChildren "No Children ever born"

* NOTE(review): prefixes k and l are panel waves 11 and 12 (a = 1, b = 2, ...), and
* NChildren above anchors kBioKids at wave 11 and lBioKids at wave 12. The wave numbers
* 9 and 11 used here for the girl counts look inconsistent with that - confirm intent.
gen NoGirls=bNoGirls if wave==2
replace NoGirls=kNoGirls if wave==9
replace NoGirls=lNoGirls if wave==11

* Survey-question based count: 0 if lprnt==2, lnprnt if lprnt==1
gen NatChildren=0 if lprnt==2
replace NatChildren=lnprnt if lprnt==1

label var NatChildren "No Children ever born (survey question)"

* Combined measure: constructed count first, then the survey question, then nchild for
* respondents younger than 30; negative values are set to missing.
gen NnChildren=NChildren if  !missing(NChildren)
replace NnChildren=NatChildren if !missing(NatChildren) & missing(NChildren)
replace NnChildren=nchild if missing(NnChildren) & age<30
replace NnChildren=. if NnChildren<0

drop fBioBaby f2BioBaby SumBaby _merge nn
save ${output_data}\\BHPSpanel, replace


