* Session setup: turn off --more-- paging, close any log that is still open
* (ignoring the error when none is open), and start a fresh log that overwrites the previous one.
set more off
capture log close
log using "Output/schooldb", replace

///////////////////////////////////////////////////////////// CREATE SCHOOLDB ///////////////////////////////////////////////////////////////////////////////////////
// - We use source files from the Department of Education to construct a dataset ("schools.dta") of schools in Flanders 2001-2012 //////////////////////////////////
// - We also add an additional identifier based on the location of a school to distinguish between schools ////////////////////////////////////////////////////////
// 		(necessary to distinguish between schools, since some schools have multiple official numbers) ////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// BEWARE !!!
// The official school identifier is nummer_instelling, BUT there can be several numbers for the same school AND one number sometimes
// corresponds to multiple addresses, so the school is in practice split up. We have address data, so we use it to build our own identifier.

*Note: missing values are filled with the value from future years if it is the same over 2 periods, or from previous years if it is the same over 2 periods

* VPL: table holding the different VPLs registered for each nummer_instelling.
* Step 1 - read the "+" extract and store it so it can be appended to the main extract below.
insheet using "Source/education/6z_vpl_in_tabel_2_verrijkt+_utf8.csv", clear names delimiter(";")

* Give the sequence-number column the name that is kept after the append.
rename intern_volgnr_vpl intn_volg_nr

* Every row of this extract is assigned schooljaar 0.
* NOTE(review): presumably the "+" extract has no school-year column of its own - confirm.
generate schooljaar = 0

save "Output/vpl", replace

* Step 2 - read the main VPL extract and rename its columns to the names used in the rest of this script.
insheet using "Source/education/6z_vpl_in_tabel_2_verrijkt_utf8.csv", clear names delimiter(";")

drop schj_hudg_ko

rename schj_hudg_cd schooljaar
label variable schooljaar "Year in which school year started"

rename inst_cd nummer_instelling
rename strt_nm straatnaam
rename huis    huisnr
rename post_cd postcode
rename gemt_cd nis_code_fusie
rename gemt_nm naam_fusiegemeente

* The variable abbreviated as intn is read in as text: list its raw values in the log,
* strip embedded blanks, then convert it to numeric, treating a comma as the decimal mark.
tabulate intn
replace intn = subinstr(intn, " ", "", .)
destring intn, replace dpcomma

* Step 3 - stack the previously saved "+" extract under the main extract that is in memory.
append using "Output/vpl"

* Keep only the identifying and address columns.
keep schooljaar nummer_instelling intn_volg_nr straatnaam huisnr postcode naam_fusiegemeente

* House numbers: remove all blanks, then turn the "Z/N" and "z/n" markers into ".".
* NOTE(review): huisnr is a string variable, so "." is a literal dot and not the
* empty-string missing value - confirm that this is what later steps expect.
replace huisnr = subinstr(huisnr, " ", "", .)
replace huisnr = "." if inlist(huisnr, "Z/N", "z/n")

* Final column names for the location table, which overwrites the intermediate file.
rename nummer_instelling hoofdnr
rename intn_volg_nr      vplnr
save "Output/vpl", replace


* SCHOOLS: tables that contain more information on the affiliation of schools.
* scholen_1 - the "+" extract.
insheet using "Source/education/6z_scholen_in_tabel_2_verrijkt+_utf8.csv", delimiter(;) clear

* Remove ko_schooljaar together with the address columns - addresses are taken from the vpl dataset instead.
drop ko_schooljaar straatnaam huisnr postcode nis_code_fusie naam_fusiegemeente

save "Output/scholen_1", replace

* scholen_2 - the main schools extract, with its columns renamed to the names used in the rest of this script.
insheet using "Source/education/6z_scholen_in_tabel_2_verrijkt_utf8.csv", delimiter(;) clear

drop schj_hudg_ko

* School year and institution number (the merge keys used further down).
rename schj_hudg_cd schooljaar
label variable schooljaar "Year in which school year started"
rename inst_cd nummer_instelling

* Address columns.
rename strt_nm straatnaam
rename huis    huisnr
rename post_cd postcode
rename gemt_cd nis_code_fusie
rename gemt_nm naam_fusiegemeente

* mach_inrc_sort_* end up as soort_im / ko_soort_im, and ondw_net_* end up as im_net_code / ko_im_net_code.
rename mach_inrc_sort_cd soort_im
rename mach_inrc_sort_ko ko_soort_im
rename ondw_net_cd       im_net_code
rename ondw_net_ko       ko_im_net_code

save "Output/scholen_2", replace

* Merge scholen_1 (master) with scholen_2 (using) on school year and institution number.
* With -update- and without -replace-, values from the using file only fill in missing
* master values, so scholen_1 is used whenever the two sources conflict.
use "Output/scholen_1", clear
merge 1:1 schooljaar nummer_instelling using "Output/scholen_2", update

* Diagnostics in the log: years of rows found only in scholen_1, then years of rows
* that matched without update (3) or matched with conflicting nonmissing values (5).
tabulate schooljaar if _merge == 1
tabulate schooljaar if inlist(_merge, 3, 5)

drop _merge
save "Output/merge_1", replace

* scholen_3 (koepels) - extract with the koepel code and name.
insheet using "Source/education/6z_scholen_in_tabel_2_verrijkt+_koepels_utf8.csv", delimiter(;) clear
* 42656obs (observation count noted by the original author)
summarize

* NOTE(review): presumably renamed to match kopl_cd / kopl_ko already present in merge_1,
* so that the -update- merge below can fill them - confirm.
rename koepel_cd kopl_cd
rename koepel_ko kopl_ko
save "Output/scholen_3", replace

* Add the koepel information to the merged schools data. Values already present in the
* master are kept, and only missing ones are filled from scholen_3.
use "Output/merge_1", clear
merge 1:1 schooljaar nummer_instelling using "Output/scholen_3", update
* Earlier manual check, kept for reference (no real problems were found):
*list if kopl_cd_merge3~=kopl_cd & !missing(kopl_cd) & !missing(kopl_cd_merge3)
drop _merge
save "Output/merge_2", replace

* scholen_4 (gemeenschap) - extract with the scholengemeenschap code and name.
insheet using "Source/education/6z_scholen_in_tabel_2_verrijkt+_scholengem_utf8.csv", delimiter(;) clear
summarize

* NOTE(review): presumably renamed to match schl_gems_cd / schl_gems_ko already present
* in merge_2, so that the -update- merge below can fill them - confirm.
rename scholengem_cd schl_gems_cd
rename scholengem_ko schl_gems_ko
save "Output/scholen_4", replace

* Add the scholengemeenschap information. Values already present in the master are kept,
* and only missing ones are filled from scholen_4.
use "Output/merge_2", clear
merge 1:1 schooljaar nummer_instelling using "Output/scholen_4", update
* Earlier manual checks, kept for reference:
*br schl_gems_cd schl_gems_cd_merge4 if schl_gems_cd~=schl_gems_cd_merge4 & !missing(schl_gems_cd_merge4) & !missing(schl_gems_cd)
*br schooljaar schl_gems_cd schl_gems_cd_merge4 if schl_gems_cd==124065
* Outlier in 2010, and only for _merge4, so again trust the large database instead of this extension.
drop _merge

* Drop the whole run of columns from straatnaam through setr_stts_cd.
* NOTE(review): this range depends on the column order in memory - confirm it still
* covers exactly the intended columns if an input file or an earlier merge changes.
drop straatnaam- setr_stts_cd
save "Output/merge_3", replace

* Continue with the data in memory (as saved to merge_3) and drop columns that are not carried further.
drop soort_im ko_soort_im nummer_im im_net_code ko_im_net_code

* Bring koepel codes that changed over time onto a single code.
replace kopl_cd = 117986 if kopl_cd == 117663   // steiner has other number in 2001
replace kopl_cd = 117978 if kopl_cd == 117671   // FOPEM

* Log which koepel names occur under each code before they are standardised.
bysort kopl_cd: tabulate kopl_ko, missing

* One fixed short name per koepel code, and no name where the code is missing.
replace kopl_ko = "" if kopl_cd == .
local kcodes 61581 61705 111311 117689 117978 117697 117986
local knames VSKO  OVSG  GO     VOOP   FOPEM  IPCO   STEINER
local nkoepels : word count `kcodes'
forvalues k = 1/`nkoepels' {
    local kcode : word `k' of `kcodes'
    local kname : word `k' of `knames'
    replace kopl_ko = "`kname'" if kopl_cd == `kcode'
}

* Turn the standardised name into a labelled numeric variable and drop the raw code and name.
encode kopl_ko, generate(koepel)
drop kopl_cd kopl_ko
tabulate koepel

* Final column names: the institution number becomes hoofdnr (the name also used in the vpl table),
* and the scholengemeenschap code and name get descriptive names.
rename nummer_instelling hoofdnr
rename schl_gems_cd      scholengemeenschap_nr
rename schl_gems_ko      scholengemeenschap_naam

drop verwijzing_sch_sg

* Do not allow the same school (hoofdnr) to have different koepels or scholengemeenschappen:
* every row of a school gets that school's most frequent value, and ties go to the highest value (maxmode).
* NOTE(review): the scholengemeenschap number and name are handled independently, so a tie
* could in principle pair a number with the name of a different scholengemeenschap - confirm.
tempvar temp
foreach v of varlist koepel scholengemeenschap_nr scholengemeenschap_naam {
    bysort hoofdnr: egen `temp' = mode(`v'), maxmode
    replace `v' = `temp'
    drop `temp'
}

* Drop the year dimension and remove the resulting identical rows.
sort hoofdnr schooljaar
drop schooljaar
duplicates drop

save "Output/nummer_inst", replace
