* Session setup: switch off output paging, restart the log file and load the input data
set more off

capture log close
log using "Output/model", replace

use "Output/database", clear

* Describe the student identifier for the 2002 and 2003 cohorts before any restriction
forvalues cohort = 2002/2003 {
	codebook anoniem if first==`cohort'
}

* A sector consisting of a single blank is recoded to the empty string
replace sector="" if sector==" "

* Stable sort: rows tied on (anoniem, schooljaar) keep their file order instead of
* being shuffled. Later steps in this file depend on row order within a student
* ([_n-1] comparisons, n==1 filters, duplicates drop keeping the first row), so an
* unstable sort would make their results differ between runs.
sort anoniem schooljaar, stable

*only keep students that start high school in eersteA
* keep marks the qualifying rows; a student is retained when any of its rows is marked
gen keep=1 if eersteA==1 & enterhs==1

by anoniem: egen temp=max(keep)
drop if temp~=1
drop temp

* Same description after the restriction
forvalues cohort = 2002/2003 {
	codebook anoniem if first==`cohort'
}

*drop observations in hoger beroepsonderwijs
drop if onderwijsvorm=="HBO"

/*
*check if students have different birth_years
sort anoniem schooljaar
by anoniem: egen mode=mode(birth)
format mode %td
br mode birth
gen test=mode-birth
tab test
drop mode test
*/

* Age measures, all relative to birth_year:
*   age       - in the school year of the row
*   agestart  - in the year after first
*   agedegree - in degreeyear
generate age       = schooljaar - birth_year

generate agestart  = first + 1 - birth_year
generate agedegree = degreeyear - birth_year

tabulate agestart agedegree

*extra background var
* repeated is 1 when first minus birth_year exceeds 11 and 0 otherwise; it stays
* missing when first or birth_year is missing (note that this is repeated before SE)
generate repeated = ((first-birth_year)>11) if !missing(first, birth_year)
label variable repeated "Repeated a grade before starting SE"


*outcome variables (note: first defines cohort based on last year in bao)
* Each indicator is 1 when degree==1 and agedegree-agestart is within the allowed
* number of years, 0 otherwise, and missing for cohorts after the cutoff year.

* On time: at most 5 years, cohorts up to 2006
generate degreeontime = (degree==1 & (agedegree-agestart<=5)) if first<=2006

* Each extra year of allowed delay moves the cohort cutoff one year earlier
forvalues delay = 1/5 {
	local lastcohort = 2006 - `delay'
	local maxyears   = 5 + `delay'
	generate degree_delay`delay' = (degree==1 & (agedegree-agestart<=`maxyears')) if first<=`lastcohort'
}

sort anoniem schooljaar

* downgrade is defined only on rows where one of the four track flags equals 1.
* A row counts as a downgrade when the student's previous row was ASO and this row is
* TSO, KSO or BSO, or when the previous row was TSO or KSO and this row is BSO.
local anytrack (ASO==1 | TSO==1 | KSO==1 | BSO==1)
local fromASO  (ASO[_n-1]==1 & (TSO==1 | KSO==1 | BSO==1))
local toBSO    ((TSO[_n-1]==1 | KSO[_n-1]==1) & BSO==1)
by anoniem: generate downgrade = (`fromASO' | `toBSO') if `anytrack'

* Spread the student-level maximum over all rows of the student
tempvar everdown
by anoniem: egen `everdown' = max(downgrade)
replace downgrade = `everdown'
drop `everdown'

* Indicator for students with more than one observation per school year:
* count is the number of rows per (anoniem, schooljaar), n the row index within that group
by anoniem schooljaar: egen count = count(schooljaar)
by anoniem schooljaar: generate n = _n
tabulate count if n==1 & first+1==schooljaar // author's note: 0.02% (or 20 students) have multiple obs per schoolyear

* Indicator for students who drop out and return:
* Lschooljaar is the school year of the student's previous row
sort anoniem schooljaar
by anoniem: generate Lschooljaar = schooljaar[_n-1]
tabulate schooljaar Lschooljaar

* A row is flagged when its school year is more than one year after the previous row;
* unbalanced then becomes the number of flagged rows per student
generate unbalanced = 1 if schooljaar > Lschooljaar + 1
by anoniem: egen temp = total(unbalanced)
replace unbalanced = temp
tabulate temp if n==1 & first+1==schooljaar // author's note: 0.72% has 1 year gap, 0.01% a 2 year gap
drop temp

* Student-level indicators: 1 when the student has a row with schooljaar equal to
* 2011 (respectively 2012), 0 otherwise
forvalues yr = 2011/2012 {
	generate stillindata`yr' = 1 if schooljaar==`yr'
	* copy the flag to every row of the student, then turn missing into 0
	by anoniem: egen temp = max(stillindata`yr')
	replace stillindata`yr' = temp
	replace stillindata`yr' = 0 if missing(stillindata`yr')
	drop temp
}


*define dropout var
	* Three variants, all starting as copies of degree_delay3 (3 years of delay max)
	foreach variant in strict wounknown soft {
		clonevar nodropout_`variant' = degree_delay3
	}
	* NOTE(review): the second and third labels are longer than 80 characters - check how they are stored
	label variable nodropout_strict "Same as degree_delay3"
	label variable nodropout_wounknown "Same as degree_delay3 but dropping observations that are still in db after 3 years of delay"
	label variable nodropout_soft "Same as degree_delay3 but setting outcome for obs still in data after 3 years of delay=1"

	* Students without a degree who are still in the data 9 years after first
	* (2011 for the 2002 cohort, 2012 for the 2003 cohort; per the author this is the year
	* they can get a degree with 3y of delay) are set to missing in the wounknown variant
	* and to 1 in the soft variant
	foreach cohort in 2002 2003 {
		local lastyear = `cohort' + 9
		replace nodropout_wounknown = . if nodropout_wounknown==0 & stillindata`lastyear'==1 & first==`cohort'
		replace nodropout_soft = 1 if nodropout_soft==0 & stillindata`lastyear'==1 & first==`cohort'
	}

	summarize nodropout* if n==1 & first+1==schooljaar

	
//extra school vars
* 0/1 school-type dummies built from the scho_some* flags:
*   scho_ASO   - ASO flag is 1 and the TSO, KSO and BSO flags are all 0
*   scho_noASO - ASO flag is 0 and at least one of the TSO, KSO, BSO flags is 1
*   scho_none  - all four flags are 0
*   scho_mixed - ASO flag is 1 and at least one of the TSO, KSO, BSO flags is 1
local noTKB  (scho_someTSO==0 & scho_someKSO==0 & scho_someBSO==0)
local anyTKB (scho_someTSO==1 | scho_someKSO==1 | scho_someBSO==1)

generate scho_ASO   = (scho_someASO==1 & `noTKB')
generate scho_noASO = (scho_someASO==0 & `anyTKB')
generate scho_none  = (scho_someASO==0 & `noTKB')
generate scho_mixed = (scho_someASO==1 & `anyTKB')

drop dup

* Panel with all school years per student
save "Output/database_model_allyears", replace

//add track info
* Take the track flags from each student's first row (by schooljaar) with natuurlijk==2
* and attach them to every row of that student as ASO2, TSO2, KSO2, BSO2
tempfile grade2
preserve
keep if natuurlijk==2
sort anoniem schooljaar
duplicates drop anoniem, force
keep anoniem ASO TSO KSO BSO
foreach trk in ASO TSO KSO BSO {
	rename `trk' `trk'2
}
save `grade2', replace
restore

merge m:1 anoniem using `grade2', nogenerate



//add track info for grade 3
* Take the track flags from each student's first row (by schooljaar) with natuurlijk==3
* and attach them to every row of that student as ASO3, TSO3, KSO3, BSO3
tempfile grade3
preserve
keep if natuurlijk==3
sort anoniem schooljaar
duplicates drop anoniem, force
keep anoniem ASO TSO KSO BSO
foreach trk in ASO TSO KSO BSO {
	rename `trk' `trk'3
}
save `grade3', replace
restore

merge m:1 anoniem using `grade3', nogenerate


* Take the track flags from each student's latest row (highest schooljaar) that has a
* track recorded and attach them to every row of that student as ASOL, TSOL, KSOL, BSOL
tempfile lasttrack
preserve
* rows where all four flags are 0, or all four are missing, carry no track information
drop if (ASO==0 & TSO==0 & KSO==0 & BSO==0) | (missing(ASO) & missing(TSO) & missing(KSO) & missing(BSO))
gsort +anoniem -schooljaar
duplicates drop anoniem, force
keep anoniem ASO TSO KSO BSO
foreach trk in ASO TSO KSO BSO {
	rename `trk' `trk'L
}
save `lasttrack', replace
restore

merge m:1 anoniem using `lasttrack', nogenerate

////////////
* From here on only the rows where the student enters high school are kept
keep if enterhs==1
////////////

* dup = number of additional rows sharing the same student id (0 means unique)
duplicates tag anoniem,gen(dup)
tab dup

* scho_ASO_same: highest scho_ASO among the student's rows, kept only for duplicated students
bysort anoniem: egen scho_ASO_same=max(scho_ASO)
replace scho_ASO_same=. if dup==0

* Flag duplicated students whose rows disagree on scho_ASO and drop them entirely.
* NOTE(review): only dup==1 (exactly two rows) is checked and only a total of exactly 1
* is dropped; students with three or more entry rows are never flagged. Confirm that
* tab dup shows no values above 1, otherwise consider dup>=1 and error>=1.
gen error=1 if scho_ASO_same~=scho_ASO & dup==1
bysort anoniem: egen temp=total(error)
replace error=temp
drop temp
tab error
drop if error==1 

codebook anoniem if first==2002
codebook anoniem if first==2003

drop dup

* Keep one randomly chosen row per remaining duplicated student.
* The seed is fixed so that the same random numbers are drawn on every run; without it
* the surviving row, and therefore the saved file, changes from run to run.
* The draw is stored as double to make ties negligible, and the sort is stable so any
* tie is broken by current row order.
* NOTE(review): full reproducibility also requires the row order at this point to be
* deterministic - confirm the upstream sorts leave no randomly ordered ties.
set seed 20020903
gen double random=runiform()
sort random, stable

duplicates drop anoniem, force //randomly drop others

drop random
rename gender male

label var ASO2 "ASO in grade 2"
label var ASO3 "ASO in grade 3"
label var ASOL "ASO in final high school year"

* One row per student
save "Output/database_model",replace

log close

