* Session setup: never pause output, close any log left open, start a fresh log
set more off

capture log close
log using "Output/students", replace

///////////////////////////////////////////////////////////// CREATE STUDENTDB ///////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

* PRIMARY EDUCATION (bao)
* Builds "Output/students_bao": for every pupil, only the rows of the last school
* year in which the pupil appears in the primary-education file are kept.
insheet using "Source/Education/1_llnbas.csv", delimiter(";") names clear

* school year is read as text: remove blanks, keep the first four characters, make numeric
replace ko_schooljaar=subinstr(ko_schooljaar," ","",.)
gen schooljaar=substr(ko_school,1,4)
destring schooljaar,replace

*only keep last year in bao -> this will define starting year of cohort in data ("first")
bysort anoniem: egen last=max(schooljaar)

tab schooljaar

keep if last==schooljaar

tab schooljaar

*drop pupils whose last bao year is after 2007 (2007 itself is kept), so later primary education is not carried along
drop if schooljaar>2007

drop ko_school

* nis_code_lln is read as text containing blanks: clean, make numeric, use the common name
replace nis_code_lln=subinstr(nis_code_lln," ","",.)
destring nis_code_lln,replace
rename nis_code_lln niscode
rename nummer_school nummer_instelling

* marker for rows that come from the primary-education file
gen bao=1

* Diagnostic: show every row whose key occurs more than once.
* dup holds the number of surplus copies, so dup>0 also shows keys with three or more rows.
duplicates tag nummer_instelling hs stamnummer schooljaar, gen(dup)
list if dup>0
* dup is only a diagnostic helper; drop it (as the secondary block does) so it is not
* carried through the append into "Output/students"
drop dup

drop financierbaar

save "Output/students_bao",replace

* SECONDARY EDUCATION (so)
* Builds "Output/students_so" and then "Output/students" = secondary rows + primary rows.
insheet using "Source/Education/2_llnsec.csv", delimiter(";") clear names

* school year is read as text: remove blanks, keep the first four characters, make numeric
replace ko_school=subinstr(ko_school," ","",.)
gen schooljaar=substr(ko_school,1,4)
drop ko_school
destring schooljaar,replace

* nis_code_lln is read as text containing blanks and "?" characters: remove both, then make numeric
replace nis_code_lln=subinstr(nis_code_lln," ","",.)
replace nis_code_lln=subinstr(nis_code_lln,"?","",.)
destring nis_code_lln,replace
rename nis_code_lln niscode
rename nummer_school nummer_instelling

* marker for rows that come from the secondary-education file
gen so=1

* Diagnostic: show every row whose key occurs more than once.
* dup holds the number of surplus copies, so dup>0 also shows keys with three or more rows.
duplicates tag nummer_instelling hs stamnummer schooljaar, gen(dup) //note: anoniempersoonsid can have doubles because enrolled in different schools in same year
list if dup>0
drop dup

drop financierbaar

tab schooljaar //obs 2001-2012
tab hs //obs in secondary(3XX)

save "Output/students_so",replace

* stack the primary-education rows under the secondary ones
append using "Output/students_bao"

save "Output/students",replace

* EXTRA DATA THAT INCLUDES LOCATION INFO + BIRTH DATE
* Builds "Output/students_import" (raw import without exact duplicate rows) and
* "Output/students_stsect" (2007 secondary rows with gender, birth date parts,
* niscode, postcode and sector, keyed on the anonymous person id).
insheet using "Source/Education/1_gegevens_LM.csv", delimiter(";") names clear

* remove rows that are identical on every variable, then report remaining key duplicates
duplicates drop
duplicates tag stamnummer geboortedatum instellingsnr schooljaar hs, gen(dup)
tab dup
list if dup~=0
drop dup

save "Output/students_import",replace

rename schooljaarcode schooljaar
rename instellingsnr nummer_instelling

keep if schooljaar==2007

* gender as a labelled numeric variable: 0 = "V", 1 = "M"
label define mv 0 "V" 1 "M",replace
encode geslacht,gen(gender) label(mv)
drop geslacht

*correct mistakes (birth years with a wrong or missing leading digit)
set more off
replace geboortedatum="06.09.1987" if geboortedatum=="06.09.1687"
replace geboortedatum="01.06.1994" if geboortedatum=="01.06.0994"
replace geboortedatum="09.03.1982" if geboortedatum=="09.03.0982"
replace geboortedatum="17.04.1985" if geboortedatum=="17.04.0985"

* geboortedatum is text in day.month.year form: derive a daily date, its parts, and a monthly date
gen birth=date(geboortedatum, "DMY")
format birth %td
gen birth_day=day(birth)
gen birth_month=month(birth)
gen birth_year=year(birth)
gen birth_monthyear=monthly(substr(geboortedatum,4,.), "MY")
format birth_monthyear %tm

replace nisc_=subinstr(nisc_," ","",.)
* NOTE(review): only schooljaar==2007 is kept above, so this condition can never be true here;
* left in place in case the year filter is widened again
replace nisc_="." if schooljaar<2007 //mistake in data, is postcode not niscode
destring nisc_, replace
rename nisc_ niscode

replace postcode=subinstr(postcode," ","",.)
destring postcode, replace

rename stat_sect sector

replace sector="" if missing(niscode) //uninformative without niscode

tab hs //obs in kindergarten (1XX), elementary(2XX) and secondary (3XX)

* keep secondary education only (rows with missing hs are kept)
drop if hs<300 & !missing(hs)

drop schooljaar

*add "anoniempersoonsid" as id variable
* lookup table school/stamnummer/hs -> anonymous id, taken from the student panel
tempfile temp
preserve
use "Output/students",clear
keep anoniem nummer_instelling stamnummer hs 
drop if missing(anoniem)
duplicates drop
save `temp'
restore

merge 1:1 nummer_instelling stamnummer hs using `temp'
drop nummer_instelling stamnummer hs
* keep only rows found in both files
keep if _merge==3

drop _merge

duplicates drop

* Diagnostic: persons that still have more than one row.
* Written to the log with list (browse only opens the Data Browser and records nothing);
* dup>0 also shows persons with three or more rows.
duplicates tag  anoniem,gen(dup)
tab dup
list if dup>0

* hand-picked resolution for one person: drop the row whose sector is a single blank
drop if anoniem=="2e17e733708283964b2f806ebd39ac6f" & sector==" "

* NOTE(review): which row survives depends on the current row order, so the result
* may differ between runs; dup is also still in the data when it is saved below
duplicates drop anoniem,force  //random drop 1 of the 2 others

save "Output/students_stsect",replace



* MERGE: attach the birth-date / location file to the student panel
use "Output/students", clear

* set the panel's own niscode aside so that the niscode of the using file comes in under that name
rename niscode niscode_standard
label variable niscode_standard "Niscode according to standard database, not the one with st sect"

* many panel rows per person, one row per person in the using file;
* update: missing values in the panel are filled from the using file
merge m:1 anoniem using "Output/students_stsect", update
tabulate schooljaar _merge
tabulate schooljaar _merge if so==1

* the panel's original niscode and the merge indicator are not kept
drop niscode_standard _merge

save "Output/students", replace

* EXTENSIONS
* SES
* Builds "Output/students_gok": 2007 secondary-education rows with school allowance,
* home-language indicators and mother's education, keyed on the anonymous person id.
insheet using "Source/Education/1_aanleveren_GOK_BaO+SO.csv", delimiter(";") names clear
*6482622obs

rename nummer_school nummer_instelling
rename hoofdstructuur hs

*only keep 2007
keep if schooljaar==2007
drop schooljaar

*only keep secondary (rows with missing hs are kept)
drop if hs<300 & !missing(hs)

keep nummer_instelling hs stamnummer schooltoelage_ik spreektaal_*  moeder_opleidingsniveau

*transform to numeric variables: "N" -> 0, "J" -> 1
label define janee 0 "N" 1 "J",replace
foreach var of varlist schooltoelage_ik spreektaal_* {
    di "`var'"
    tab `var'
    * strip blanks, encode into a scratch variable, then put it back under the original name
    replace `var'=subinstr(`var'," ","",.)
    encode `var',gen(temp) label(janee)
    drop `var'
    rename temp `var'
}

* mother's education: code 0 is treated as missing, then three dummies are built
* (levels 1-3, level 4, level 5); each dummy is missing when the level is missing
destring moeder_opleidingsniveau,replace
replace moeder_opleidingsniveau=. if moeder_opleidingsniveau==0
gen opl_moe1 = 0 if !missing(moeder_opleidingsniveau)
replace opl_moe1 = 1 if moeder_opleidingsniveau==1 | moeder_opleidingsniveau==2 | moeder_opleidingsniveau==3
gen opl_moe2 = 0 if !missing(moeder_opleidingsniveau)
replace opl_moe2 = 1 if moeder_opleidingsniveau==4
gen opl_moe3 = 0 if !missing(moeder_opleidingsniveau)
replace opl_moe3 = 1 if moeder_opleidingsniveau==5

*Some extra info from the data supplier that's useful (translated from Dutch):
*"In 2006 and 2008 the equal-opportunity indicators were adjusted further and a different weight was given to the
*different indicators. Mainly the home-language indicator was refined to: the language the pupil speaks in the family,
*that is the language the pupil speaks with father, mother, brothers or sisters, is not Dutch. That language is not Dutch if
*the pupil speaks Dutch with nobody in the family or, in a family with three family members (not counting the pupil),
*with at most one family member. Several brothers and sisters are always counted as one family member."

* a "_ned" answer is set to missing when the matching "_geen" indicator equals 1
replace spreektaal_mdr_ned=. if spreektaal_mdr_geen==1
replace spreektaal_vdr_ned=. if spreektaal_vdr_geen==1
replace spreektaal_brz_ned=. if spreektaal_brz_geen==1

* ned_thuis: defined when at least one of the three answers is known; 1 when any of them is 1 ...
gen ned_thuis=0 if !missing(spreektaal_mdr_ned) | !missing(spreektaal_vdr_ned) | !missing(spreektaal_brz_ned)

replace ned_thuis=1 if spreektaal_mdr_ned==1
replace ned_thuis=1 if spreektaal_vdr_ned==1
replace ned_thuis=1 if spreektaal_brz_ned==1

    *extra requirement if 3 members of the family
    * ... but back to 0 as soon as two of the three answers are 0
    replace ned_thuis=0 if spreektaal_mdr_ned==0 & spreektaal_vdr_ned==0
    replace ned_thuis=0 if spreektaal_mdr_ned==0 & spreektaal_brz_ned==0
    replace ned_thuis=0 if spreektaal_vdr_ned==0 & spreektaal_brz_ned==0


* complements of the two indicators
gen no_ned_thuis = 1 -ned_thuis

gen mother_no_ned=1-spreektaal_mdr_ned

rename schooltoelage_ik toelage_SO

*add id
* lookup table school/stamnummer/hs -> anonymous id, taken from the student panel
tempfile temp
preserve
use "Output/students",clear
keep anoniem nummer_instelling stamnummer hs 
drop if missing(anoniem)
duplicates drop
save `temp'
restore

merge 1:1 nummer_instelling stamnummer hs using `temp'
drop nummer_instelling stamnummer hs
* keep only rows found in both files
keep if _merge==3

drop _merge

duplicates drop

* Diagnostic: persons that still have more than one row.
* Written to the log with list (browse only opens the Data Browser and records nothing);
* dup>0 also shows persons with three or more rows.
duplicates tag  anoniem,gen(dup)
tab dup
list if dup>0

* of a pair of rows for the same person, drop the one without a mother-language answer
* NOTE(review): the later m:1 merge on anoniem needs one row per person in this file;
* this rule does not guarantee that by itself - confirm against the listing above
drop if missing(spreektaal_mdr_ned) & dup==1
drop  dup

save "Output/students_gok",replace


* STUDY RESULTS
* Builds "Output/students_certificate": per person, whether a degree or certificate
* was ever recorded and the earliest school year in which that happened.
insheet using "Source/Education/5_studieressec.csv", delimiter(";") names clear

rename schooljaar_code schooljaar
rename administratieve_gr nummer_admgr

* result code is text; remove blanks before comparing
replace laatst_beh_studieb = subinstr(laatst_beh_studieb, " ", "", .)

* row-level flag, defined only when a result code is present:
*   "03" = degree secondary education
*   "02" = Studiegetuigschrift (for BSO (=vocational track) students who complete 12th grade)
gen degree = inlist(laatst_beh_studieb, "02", "03") if !missing(laatst_beh_studieb)

* year of the row when that row carries a degree, then the earliest such year per person
gen degreeyear = schooljaar if degree==1
bysort anoniem: egen earliest = min(degreeyear)
replace degreeyear = earliest
drop earliest

* person-level flag: 1 if any row of the person carries a degree
bysort anoniem: egen ever = max(degree)
replace degree = ever
drop ever

* reduce to the person-level variables and collapse identical rows
keep anoniem_persoonsid degree degreeyear
duplicates drop

label variable degree "Students obtains degree SO or cert BSO in 2001-2012"

save "Output/students_certificate", replace


/// merge students with extensions
* Final assembly of "Output/students": adds the SES file and the degree file to the panel,
* selects the cohort, records the primary school each student came from, and then keeps
* only secondary-education rows up to the year a degree was obtained.
use "Output/students", clear
* many panel rows per person, one row per person in the SES file
merge m:1 anoniem using "Output/students_gok", update
tab schooljaar if _merge==1 |_merge==2
tab schooljaar if _merge==3 |_merge==4 | _merge==5

*first = earliest school year in which the student appears in the panel
bysort anoniem: egen first=min(schooljaar)
tab first

**********
*SELECT COHORT
* only students whose first observed year is 2002 or 2003 are kept
keep if first==2002 | first==2003
**********

* NOTE(review): this writes temp.dta into the working directory (it is not the `temp' tempfile
* used earlier) while _merge is still in the data; nothing further down in this file reads it,
* so it looks like a leftover checkpoint - confirm before removing
save temp,replace
drop _merge
* add the person-level degree flag and degree year
merge m:1 anoniem_persoonsid using "Output/students_certificate", update
tab _merge if so==1 //0.14% missing
drop _merge

*define first year in so and then drop cohort that we cannot use and drop bao observations and define first year high school
* baoyear = school year of the student's bao rows; temp = latest such year per student
gen baoyear=schooljaar if bao==1
bysort anoniem: egen temp=max(baoyear)
tab temp first

* students without any bao row have temp missing
drop if missing(temp) //cannot identify first year of high school for these cohorts

* enterhs is 1 on rows of the school year right after the last bao year, missing otherwise
gen enterhs=1 if schooljaar==temp+1
label var enterhs "Schoolyear after last year on bao"
drop temp


*add identifier of primary school
* the [1] subscripts below refer to the first row of each student in this sort order
sort anoniem schooljaar
by anoniem: egen temp=total(bao)
gen error=1 if temp~=1 //create identifier if there are multiple observations of bao
tab error
drop temp

by anoniem: gen temp=bao[1]
replace error=1 if temp~=1 //update identifier if first observation is not bao
tab error
drop temp

* copy school number and intern_volgnr_vpl of the student's first row to all rows of the student;
* left missing for students flagged with error
by anoniem: gen bao_nummer_instelling=nummer_instelling[1] if error~=1
by anoniem: gen bao_intern_volgnr_vpl=intern_volgnr_vpl[1] if error~=1

label var bao_nummer_instelling "nummer_instelling of bao student went to"
label var bao_intern_volgnr_vpl "intern_volgnr_vpl of bao student went to"

* remove the primary-education rows and the helper variables
* NOTE(review): a helper variable dup created earlier in this file may still be present
* in the data at this point; confirm whether it should be dropped here as well
drop if bao==1
drop bao
drop baoyear
drop so
drop last
drop error

tab first schooljaar

*drop observations after having obtained a degree
* rows with a missing degree year or a missing school year are kept
drop if !missing(degreeyear) & !missing(schooljaar) & degreeyear<schooljaar

tab first schooljaar

save "Output/students",replace


log close
