* ---------------------------------------------------------------------------
* Setup: reset the session, copy the caller-supplied globals into locals,
* load the analysis data and keep a random share of the observations.
* The globals x, y, subsample and sample are read here but not defined in
* this file; they must be set by the caller.
* ---------------------------------------------------------------------------
clear all
set matsize 10000

* locals (used below to build log and graph file names)
local x $x
local y $y
local s $subsample

* control variables
* The list is terminated on its own line. It previously ended with a
* line-continuation marker, so a statement added on the following line
* would have been absorbed into the global.
global controlvar opl_moe1 opl_moe2 no_ned_thuis toelage_SO male repeated

// Open data
use "Output/database_model_final",clear
* the variable named by global x becomes x, the one named by global y becomes y
rename $x x
label var x "Elite HS"
rename $y y
drop if missing(y)
drop if missing(x)

// Random drop: keep observations whose uniform draw does not exceed global sample
* NOTE(review): no seed is set in this file, so the retained subsample is only
* reproducible if the caller sets one - confirm.
gen random=runiform()
drop if random>$sample
drop random

// Open log
// Close any log that is still open (errors ignored), then start a new one
// whose name combines the treatment name, outcome name and subsample code.
capture log close
log using "main_`x'_`y'_`s'_control1", replace

// Make instrument
// Attach both distance files on niscode and sector. Rows that exist only in
// the using file are discarded, so the set of observations is unchanged.
foreach distfile in minimumdist_adnr minimumdist_adnr_count {
	merge m:1 niscode sector using "Output/`distfile'"
	drop if _merge==2
	drop _merge
}

// Distance measure for the treated school type
gen mindist_treat=d_scho_ASO

// Distance measure for the non-treated school types: the smallest of the
// listed variables. min() skips missing arguments, so the result is missing
// only when every listed variable is missing.
gen mindist_notreat=.
foreach dist of varlist d_scho_noASO d_scho_mixed {
	replace mindist_notreat=min(mindist_notreat,`dist')
}

// Instrument: non-treated distance minus treated distance
gen z=mindist_notreat-mindist_treat
label var z "Dist nonElite - dist Elite"
global propvars z

// Number of schools within 5 km distance
gen totsch5 =  tot_5km_scho_notreat + tot_5km_scho_treat

// Drop observations without control variables (based on control variables from main analysis to keep the same number of observations)
// The variable list comes from global controls2, which is not defined in this file.
foreach ctrl of varlist $controls2 {
	drop if missing(`ctrl')
}

// Selection of the sample
// Exactly one branch can apply because global subsample holds a single value.
if $subsample==2 {
	// remove observations with 0, 1, 2 or 3 schools within 5 km
	forvalues n=0/3 {
		drop if totsch5==`n'
	}
}
else if $subsample==3 {
	drop if d4_scho_any< d_scho_ASO
}
else if $subsample==4 {
	keep if ASO2==1
}
else if $subsample==5 {
	keep if scho_someASO==1
}
else if $subsample==6 {
	drop if dropout_uncertain==1
}

* Domain for MTE graphs (at least 10 students in both elite and non-elite schools)
* Only for subsample 1: tabulate the rounded propensity score against
* treatment and set the upper bound of the plotting domain.
* NOTE(review): global dom is assigned only in this branch but is read by
* mtefeplot for every subsample - confirm that a subsample 1 run always
* precedes the others in the same session.
if $subsample==1 {
	probit x z $controlvar
	predict propscore
	generate prop_score = round(propscore, .01)
	tabulate prop_score x
	global dom = 0.77
}

* Checkpoint of the prepared sample; reloaded before the bootstrap step
tempfile temp_1
save `temp_1', replace

********************
*** OLS and 2SLS ***
********************

* Descriptive tables and linear models are produced only for subsample 1
* with outcome nodropout_strict.
if $subsample==1 & "$y"=="nodropout_strict" {
	* Table 1: Enrollment in elite and non-elite schools
	local balancevars male repeated opl_moe3 opl_moe2 opl_moe1 no_ned_thuis toelage_SO mediaaninkomen_nis mediaaninkomen totaalbelgrel_nis totaalbelgrel hogeronderwijsrel_nis hogeronderwijsrel sechogerrel_nis sechogerrel bev_nis bev mindist_treat mindist_notreat
	foreach grp in 1 0 {
		sum `balancevars' if x==`grp'
	}

	* Extra and footnotes
	sum x ASO2 y degreeontime downgrade mindist_treat mindist_notreat dropout_uncertain
	tabstat mindist_treat mindist_notreat,stats(p1 p5 p10 p25 p50 p75 p90 p95 p99)

	* Table 2: Track choice and study outcomes by initial school choice
	local outcomevars ASO2 y degreeontime downgrade
	foreach grp in 1 0 {
		sum `outcomevars' if x==`grp'
	}

	* Table 3: First stage: choosing an elite school
	* F-stat of exclusion restrictions are computed in 2SLS regressions of Table 13
	reg x z $controlvar, cluster(location)

	* Table 4: Testing the instrument: effect of student characteristics on relative distance
	* (global controls1 is not defined in this file)
	reg z $controlvar,cluster(location)
	test $controls1 repeated

	* Table 13: Obtaining a high school degree: OLS and 2SLS
	reg y x $controlvar, cluster(location)
	ivregress 2sls y (x=z) $controlvar, cluster(location)
	estat firststage
}

	
********************************
*** SEMI PARAMETRIC APPROACH ***
********************************

* common support
* Propensity score from a probit of treatment on the instrument and controls,
* then overlaid frequency histograms of the score by treatment status.
probit x z $controlvar
predict p
local histopts graphregion(color(white))  freq width(0.01) xtitle(P(z)) xlabel(0 0.2 0.4 0.6 0.8 1) ylabel(#5 ,format(%9.0g) ) ytitle("")
twoway hist p if x==0, legend(order(1 "No treatment" 2 "Treatment")) `histopts'  fcolor(gs10) lcolor(white) ///
	|| hist p if x==1 , fcolor(none) lcolor(black) `histopts'
graph save "Output/support_`x'_`y'_`s'_control1",replace

* Weights for ATT
* For each grid point i/100 (i = 1..100) store the share of observations whose
* propensity score p exceeds that point; the value at grid point 0 is set to 1.
* The plotted weight is that share divided by the mean of x.
sum x
scalar scal_meanp=r(mean)
scalar scal_weight_att_0=1
scalar scal_weight_att_total=scal_weight_att_0
tempvar above
forvalues i=1/100 {
	local u=`i'/100
	* indicator for a propensity score above the grid point (missing p stays missing)
	gen `above'=p>`u' if !missing(p)
	sum `above'
	scalar scal_weight_att_`i'=r(mean)

	scal scal_weight_att_total=scal_weight_att_total+scal_weight_att_`i'
	di "weight at `i'%=" scal_weight_att_`i'
	drop `above'
}


* Grid 0, 0.01, ..., 1 held in the first 101 observations
gen temp=_n-1 in 1/101
replace temp=temp/100
gen weight_graph=.
* Observation i+1 holds grid point i/100, so the weight for grid point i is
* written to observation i+1 and the loop starts at 0. Writing it to
* observation i instead would shift the curve left by 0.01 and leave out
* the value at grid point 0.
forvalues i=0/100 {
	local row=`i'+1
	replace weight_graph=scal_weight_att_`i'/scal_meanp in `row'
}

tw line weight_graph  temp, fcolor(none) lcolor(black) xtitle(U_D) ytitle("") graphregion(color(white))
graph save "Output/ATT_weights_`x'_`y'_`s'_control1",replace



* Asymptotically optimal constant bandwidth for Y
* Step 1: preliminary semi-parametric mtefe run (2 bootstrap replications only)
* to obtain the coefficients on the controls and the propensity score propsc.
mtefe y $controlvar (x=z $controlvar), gridpoints(100) first second semi degree(2)  kernel(epanechnikov) vce(cluster location) savepropensity(propsc) bootreps(2)

* Step 2: residualised outcome. Columns 1-6 of e(b) multiply the six controls
* and columns 7-12 multiply the same controls interacted with propsc.
* NOTE(review): this assumes e(b) lists the coefficients in the order of
* global controlvar with no constant in between - confirm against mtefe.
* The outcome y is the leading term: the bandwidth is meant for a local
* polynomial regression of the outcome net of the covariate part on the
* propensity score. Without y, only the negated covariate index would be
* smoothed.
matrix B=e(b)
gen y_tilde = y ///
	-(opl_moe1*B[1,1] + opl_moe2*B[1,2] + no_ned_thuis*B[1,3] + toelage_SO*B[1,4] + male*B[1,5] + repeated*B[1,6]) ///
	-(opl_moe1*B[1,7] + opl_moe2*B[1,8] + no_ned_thuis*B[1,9] + toelage_SO*B[1,10] + male*B[1,11] + repeated*B[1,12])*propsc

* Step 3: bandwidth chosen by lpoly for a degree-2 fit of y_tilde on propsc
lpoly y_tilde propsc, degree(2)
di r(bwidth)
global bw = r(bwidth)

* Estimation of treatment effects
* Same specification, now with global bootloop replications and the bandwidth from step 3.
mtefe y $controlvar (x=z $controlvar), gridpoints(100) first second semi degree(2)  kernel(epanechnikov) bootreps($bootloop) vce(cluster location) ybwidth($bw)
* NOTE(review): global dom is assigned only in the subsample 1 branch earlier
* in this file - confirm it is populated when other subsamples are run.
mtefeplot, trimsupport(0 $dom)
graph save "Output/MTE_`x'_`y'_`s'_control1",replace




***************************
*** PARAMETRIC APPROACH ***
***************************

* Estimation
* Switching probit: outcome equation on the controls, selection equation for x
* on the instrument and the controls, standard errors clustered on location.
switch_probit y $controlvar, select(x z $controlvar) cluster(location)

est store baseline

*predictions for treatment effects and to calculate other effects
predict att,tt
predict ate,te
predict atnt,tu

* Summaries of the three effects for the log.
* No scalars are stored here: a scalar name built from a loop index would be
* meaningless because no such index is defined at this point. The means are
* collected as scal_att, scal_ate and scal_atnt further down in this file.
foreach var of varlist att ate atnt {
	sum `var'
}

predict psel, psel //probability of treatment
predict xb1,xb1 //linear prediction of outcome equation if treated
predict xb0,xb0 // "" nontreated
predict zb,zb // "" of participation equation

//distributional
// Correlation parameters of the two regimes as stored by the estimator
est restore baseline
foreach regime in 1 0 {
	scalar scal_rho`regime'=e(rho`regime')
	di scal_rho`regime'
}

// Standard errors of tanh(athrho) via nlcom. nlcom posts its own results, so
// the baseline estimates are restored after each call.
foreach regime in 1 0 {
	nlcom tanh(_b[/athrho`regime']),post
	mat mat_temp=e(V)
	scalar scal_rho`regime'_se=sqrt(mat_temp[1,1])
	di scal_rho`regime'_se
	est restore baseline
}


**marginal effects on TE
* For every effect type (tt, te, tu) the prediction is recomputed with one
* covariate switched, and the difference to the reference prediction is
* stored in me_<effect>_<variable>. Original covariate values are restored
* from clones after each step.
tempvar te_base te_alt
foreach effect in  tt te tu {
	di "`effect'"

	*categorical variable degree mother
	clonevar opl_moe1_clone=opl_moe1
	clonevar opl_moe2_clone=opl_moe2

	* reference prediction: both dummies at zero
	replace opl_moe1=0
	replace opl_moe2=0
	predict `te_base',`effect'

	* opl_moe1 switched on
	replace opl_moe1=1
	predict `te_alt',`effect'
	gen me_`effect'_opl_moe1=`te_alt'-`te_base'
	drop `te_alt'

	* opl_moe2 switched on instead
	replace opl_moe1=0
	replace opl_moe2=1
	predict `te_alt',`effect'
	gen me_`effect'_opl_moe2=`te_alt'-`te_base'
	drop `te_alt' `te_base'

	* put the observed values back
	replace opl_moe2=opl_moe2_clone
	replace opl_moe1=opl_moe1_clone
	drop opl_moe1_clone opl_moe2_clone


	* dummy variables: effect of a change from 0 to 1
	foreach dummy of varlist no_ned_thuis toelage_SO male repeated {
		clonevar `dummy'_clone=`dummy'
		replace `dummy'=0
		predict `te_base',`effect'
		replace `dummy'=1
		predict `te_alt',`effect'
		gen me_`effect'_`dummy'=`te_alt'-`te_base'
		drop `te_alt' `te_base'
		replace `dummy'=`dummy'_clone
		drop `dummy'_clone
	}

}
*get out probs
* Standard normal CDF of each linear index
generate prob0 = normal(xb0) //prob to succeed, cond on no treatment
generate prob1 = normal(xb1)
generate treat = normal(zb)

*gain and loose
* gain:  prob1 minus the bivariate normal probability at (xb0, xb1)
* loose: (1 - prob1) minus the bivariate normal probability at (-xb0, -xb1)
* Both use the correlation scal_rho01 defined on the next line.
scalar scal_rho01 = 2*scal_rho0*scal_rho1
generate gain  = prob1 - binormal(xb0, xb1, scal_rho01)
generate loose = (1-prob1) - binormal(-xb0, -xb1, scal_rho01)

set more off
*gain and loose if (un)treated (simulation assisted)
* Draw global maxsim sets of correlated standard normal errors (selection,
* untreated outcome, treated outcome), count how often each observation gains
* or loses from treatment, split by simulated treatment status, and convert
* the counts into conditional frequencies.
matrix mat_M=(0,0,0)
scal scal_rho01=2*scal_rho0*scal_rho1 //this does not change predicted probs but it does change gain from treatment!
matrix mat_V=(1,scal_rho0,scal_rho1 \ scal_rho0,1,scal_rho01 \ scal_rho1,scal_rho01,1)

* accumulators, all starting at zero
gen psel_sim=0
gen treat_sim=0
gen prob0_sim=0
gen prob1_sim=0
gen gain_tt_sim=0
gen loose_tt_sim=0
gen gain_ut_sim=0
gen loose_ut_sim=0

* The loop index is called sim, not s: the local s already holds the subsample
* code used in output file names and must not be overwritten by the loop.
* NOTE(review): no seed is set in this file, so the draws are only
* reproducible if the caller sets one - confirm.
qui forvalues sim=1/$maxsim {
	drawnorm UD U0 U1, corr(mat_V) means(mat_M)
	* simulated treatment and potential outcomes for this draw
	replace treat_sim=(zb+UD>0)
	replace prob0_sim=(xb0+U0>0)
	replace prob1_sim=(xb1+U1>0)

	* gain = success only when treated; loose = success only when untreated
	replace gain_tt_sim=(gain_tt_sim+prob1_sim*(1-prob0_sim)*treat_sim)
	replace loose_tt_sim=(loose_tt_sim+(1-prob1_sim)*prob0_sim*treat_sim)
	replace gain_ut_sim=(gain_ut_sim+prob1_sim*(1-prob0_sim)*(1-treat_sim))
	replace loose_ut_sim=(loose_ut_sim+(1-prob1_sim)*prob0_sim*(1-treat_sim))

	replace psel_sim=psel_sim+treat_sim
	drop UD U0 U1
}

* counts -> frequencies, conditional on simulated treatment status
replace psel_sim=psel_sim/$maxsim
replace gain_tt_sim=(gain_tt_sim/$maxsim)/psel_sim
replace loose_tt_sim=(loose_tt_sim/$maxsim)/psel_sim
replace gain_ut_sim=(gain_ut_sim/$maxsim)/(1-psel_sim)
replace loose_ut_sim=(loose_ut_sim/$maxsim)/(1-psel_sim)

replace gain_tt_sim=. if x==0 //such that expectation over X is okay
replace loose_tt_sim=. if x==0
replace gain_ut_sim=. if x==1
replace loose_ut_sim=. if x==1

* Report sample means of the treatment parameters and marginal effects
local effectvars att ate atnt gain loose gain_tt_sim loose_tt_sim gain_ut_sim loose_ut_sim

di "TREATMENT PARAMETERS"
tabstat `effectvars',stats(mean) col(stats)

di "MARGINAL EFFECTS"
tabstat me_*,stats(mean) col(stats)


*collect all interesting statistics in scalars
* one scalar per variable, named scal_ followed by the variable name
foreach v of varlist `effectvars' me_* {
	quietly summarize `v'
	scalar scal_`v'=r(mean)
}
	
//bootstrap
* Reload the sample checkpointed before the analysis and store it under a
* fixed file name (presumably read by twostage_draws_control1.do - confirm).
use `temp_1',clear
save "Output/startpred_clean_twostage",replace


* Number of parallel clusters, defined once and used for both the cluster
* setup and the per-cluster replication count.
* NOTE(review): loopspercluster is not an integer when global bootloop is not
* a multiple of the cluster count - confirm how the draws script handles this.
local nclusters 5

if $parallel_allow==1 {
	parallel setclusters `nclusters',force
	global loopspercluster=$bootloop/`nclusters'
	parallel do "twostage_draws_control1.do", nodata
}
else {
	* serial run: set the globals a single parallel instance would see
	global PLL_CLUSTERS=1
	global pll_instance=1
	global loopspercluster=$bootloop
	do "twostage_draws_control1.do"
}

	
*summarize the draws
* Combine the per-replication files side by side on the key var, keeping only
* rows present in every file.
use "Output/twostage_draw_1",clear
forvalues i=2/$bootloop {
	merge 1:1 var using  "Output/twostage_draw_`i'",keep(3) nogen
}

* bootstrap standard error: row-wise standard deviation across the draw columns
egen sd=rowsd(draw*)
levelsof var,local(vars)

* point estimate: the scalar scal_<name> collected from the full-sample run
gen est=.
foreach var in `vars' {
	replace est=scal_`var' if var=="`var'"
}
order est,before(sd)

* Star when the absolute estimate exceeds 1.96 standard deviations.
* Missing values compare as larger than any number, so rows with a missing
* estimate or standard deviation are excluded explicitly; otherwise they
* would be starred.
gen star=""
replace star="*" if abs(est)-1.96*sd>0 & !missing(est, sd)

keep var est sd star

list

log close
