/* Session options: process every observation (obs=max), leave page numbers and
   the date off printed output, and allow variable labels in procedure output. */
options obs=max nonumber nodate label;

/* Library reference pointing at the NLSY97 raw-data folder. */
libname frawdata '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/raw data';

/* Run the individual-level cleaning program. Later steps read a data set named
   INDIVIDUAL, presumably produced by this include (confirm in that file). */
%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/individual clean.sas';

%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/interview clean.sas';
/* Tidy INTERVIEW in place: invalid cv_hdc / cv_hgc codes become missing, and
   the interview date and interview month variables get the short names
   DATE and CM used by the rest of this program. */
data interview (rename=(interviewdate=date interviewcm=cm));
	set interview;
	/* Negative cv_hdc values are not usable. */
	if cv_hdc < 0 then call missing(cv_hdc);
	/* cv_hgc is only kept when it lies in 1-20. */
	if cv_hgc < 1 or cv_hgc > 20 then call missing(cv_hgc);
run;

%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/yearly prisec clean.sas';
/* Tidy ED_ACADYR in place: negative hdc_startacyr values become missing and
   academic_yr is renamed to YR so it can serve as a BY/merge key later. */
data ed_acadyr (rename=(academic_yr=yr));
	set ed_acadyr;
	if hdc_startacyr < 0 then call missing(hdc_startacyr);
run;

%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/monthly clean.sas';
/* Re-key the monthly records to September-to-August years and derive the
   monthly attendance indicators. */
data monthly;
	set monthly;

	/* January-August rows are assigned to the previous YR value, so one YR
	   spans September through the following August. */
	if mon < 9 then yr = yr - 1;

	/* A value of 99, any negative value, and (because SAS orders missing
	   below every number) a missing value are all recoded to zero. */
	if num_arrest = 99 or num_arrest < 0 then num_arrest = 0;
	if incarceration < 0 then incarceration = 0;

	/* 0/1 attendance flags for the month. */
	HsAttend  = (hs_status = 2);
	ColAttend = (col_status >= 2);
	Attend    = (hs_status >= 2 or col_status >= 2);
run;

/* Share of months attended per id and year. Only months with
   col_status >= 0 are counted; Nmon is the number of such months. */
proc summary data = monthly;
	by id yr;
	where col_status >= 0;
	var Attend ColAttend HsAttend;
	output out = hist_edatt (drop=_type_ rename=(_freq_=Nmon)) mean=;
run;

/* Total arrests per id and year. */
proc summary data = monthly;
	by id yr;
	var num_arrest;
	output out = hist_arrest (drop=_type_ _freq_) sum=;
run;
	
/* One row per respondent holding LASTCM: the month index (January 1980 = 1)
   of the Sunday that begins the week before the latest dated interview. */
data inc;
	set interview (keep=id date where=(date ne .));
	by id date;
	if not last.id then delete;

	/* date - (weekday - 1) is the Sunday of the interview week; stepping
	   back seven more days gives the Sunday of the preceding week. */
	lastwk = date - weekday(date) - 6;
	format lastwk mmddyy8.;

	lastcm = 12*(year(lastwk) - 1980) + month(lastwk);
	keep id lastcm;
run;
/* Expand each respondent into one row per month from the month after
   dob_cm through LASTCM, then keep only the September rows (mon=9), i.e. one
   row per year. CM is a month index in which January 1980 equals 1; YR and
   MON are decoded from it, DATE is the first day of that month, and AGE is
   the age in years on DATE. */
data inc (keep = id cm mon yr date age where=(mon=9));
	merge inc individual (keep=id dob_feasible dob_cm);
	by id;
	/* An iterative DO with a missing start or stop value is a run-time error
	   that halts the whole step. A respondent with no row in INC (no dated
	   interview) or without dob_cm is therefore skipped: no rows are written
	   for that id and processing continues with the next one. */
	if nmiss(dob_cm, lastcm) = 0 then do;
		do cm = dob_cm+1 to lastcm;
			yr = ceil(cm/12)+1979;
			mon = cm - (yr-1980)*12;
			date = mdy(mon,1,yr);
			age = (date-dob_feasible)/365.25;
			output;
		end;
	end;
run;

%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/behavior clean.sas';
%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/hdc.sas';
%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/hgc.sas';
%include '/folders/myshortcuts/GenderGap/supplemental files/nlsy97/sas files/education history.sas';

/* Interviews carrying a positive gpain8 code. The code is parked under the
   name COL1 because SAS names are case-insensitive: GPAin8, built in the
   next step, would otherwise be the same variable as gpain8. */
data grade_in8 (rename=(gpain8=col1));
	set interview (keep = id date gpain8 where=(gpain8>0));
	by id date;
run;

/* Convert codes 1-8 to grade points in half-point steps (1 -> 0.5, ...,
   8 -> 4). Any other code leaves GPAin8 missing. */
data grade_in8 (drop=col1);
	set grade_in8;
	if col1 in (1, 2, 3, 4, 5, 6, 7, 8) then GPAin8 = col1 / 2;
run;

/* Newest interview first within id, as required by the descending merge
   that follows. */
proc sort data = grade_in8;
	by id descending date;
run;

/* One row per id and academic year, dated September 1 of YR so it can be
   interleaved with interview dates. */
data grd8;
	set ed_acadyr;
	date = mdy(9,1,yr);
	keep id yr date grade_attended;
run;
proc sort data = grd8; by id descending date; run;
/* Walk each id from the latest date to the earliest, interleaving the
   academic-year rows (a) with the dates of interviews that reported an
   8th-grade GPA (b). INFODATE is retained, so on every academic-year row it
   holds the earliest such interview date that falls on or after that row's
   own date (missing when there is none). Only academic-year rows are kept. */
data grd8;
	merge grd8 (in=a) grade_in8 (in = b keep = id date);
	by id descending date;
	retain infodate;
	if first.id then infodate = .;
	if b then infodate = date;
	if a;
	format infodate mmddyy8.;
run;
proc sort data = grd8; by id date; run;
/* Keep, per id, the latest academic year spent in grade 8 that has a
   matching GPA interview date. */
data grd8;
	set grd8 (where=(grade_attended=8 and infodate ne .));
	by id date;
	if last.id;
run;

/* Restore ascending date order before matching on the interview date. */
proc sort data = grade_in8;
	by id date;
run;

/* Attach the GPA reported at the matched interview (INFODATE) to the
   grade-8 academic year chosen in GRD8. The result has one row per id with
   YR and GPAin8; ids without a usable GPA are dropped. */
data grade_in8;
	merge grd8 (in=inGrd8) grade_in8 (rename=(date=infodate));
	by id infodate;
	if inGrd8 and GPAin8 ne .;
	keep id yr GPAin8;
run;

/* Household-relationship codes taken from the individual-level variables
   rel_to_hh2, rel_to_hh6 and rel_to_hh12. Each is placed on the yearly row
   where the respondent's whole-year age is 2, 6 or 12, and only codes in
   1-10 are accepted. Rows without an accepted code are discarded. */
data yth_relHH;
	merge inc (keep = id yr age)
	      individual (in=inIndiv keep = id rel_to_hh2 rel_to_hh6 rel_to_hh12);
	by id;
	if not inIndiv then delete;

	select (floor(age));
		when (2)  if 1 <= rel_to_hh2  <= 10 then rel_to_HH = rel_to_hh2;
		when (6)  if 1 <= rel_to_hh6  <= 10 then rel_to_HH = rel_to_hh6;
		when (12) if 1 <= rel_to_hh12 <= 10 then rel_to_HH = rel_to_hh12;
		otherwise;
	end;

	if rel_to_HH = . then delete;
	keep id yr rel_to_HH;
run;

/* Stamp each dated interview with its September-to-August year: interviews
   held before September belong to the year that started the previous
   September. */
data hist_HH;
	set interview (where=(date ne .));
	yr = year(date) - (month(date) < 9);
run;

/* When several interviews fall in the same year, keep the earliest one. */
data hist_HH;
	set hist_HH;
	by id yr date;
	if not first.yr then delete;
run;
/* Clean the household measures and build weekly homework / reading hours. */
data hist_HH;
	set hist_HH;

	/* Codes at or below the valid range are treated as missing. */
	if rel_to_HH <= 0 then call missing(rel_to_HH);
	if num_HH_under18 < 0 then call missing(num_HH_under18);
	if mom_parent_style <= 0 then call missing(mom_parent_style);
	if piat < 0 then call missing(piat);

	/* Negative incomes above -999999 are floored at zero; -999999 and below
	   (and missing) end up missing. */
	if family_income < 0 then do;
		if family_income > -999999 then family_income = 0;
		else call missing(family_income);
	end;

	/* Weekday minutes per day: hours are capped at 6 and the total is floored
	   at 0. NOTE(review): MIN and MAX skip missing arguments, so a missing
	   weekday-hours value counts as 6 hours and a missing total becomes
	   0 minutes - confirm this is intended. */
	HW_min_day = max(0, 60*min(HW_wkday_hr, 6) + HW_wkday_min);
	HW_hrs_wk = (HW_wkday_times*HW_min_day + 60*HW_wkend_hr + HW_wkend_min) / 60;
	if HW_does = 0 then HW_hrs_wk = 0;

	RD_min_day = max(0, 60*min(RD_wkday_hr, 6) + RD_wkday_min);
	READ_hrs_wk = (RD_wkday_times*RD_min_day + 60*RD_wkend_hr + RD_wkend_min) / 60;
	if RD_does = 0 then READ_hrs_wk = 0;

	keep id yr rel_to_HH family_income num_HH_under18 piat
		mom_parent_style HW_hrs_wk READ_hrs_wk;
run;
/* Where the interview-based relationship code is missing, fall back on the
   code from YTH_RELHH for the same id and year, provided it lies in 1-10. */
data hist_HH (drop=rel_to_HH_alt);
	merge hist_HH yth_relHH (rename=(rel_to_HH=rel_to_HH_alt));
	by id yr;
	if rel_to_HH = . then do;
		if 1 <= rel_to_HH_alt <= 10 then rel_to_HH = rel_to_HH_alt;
	end;
run;


/* Time-invariant respondent characteristics plus a five-level parental
   schooling category built from hgc_bio_mom / hgc_bio_dad:
     0 = value outside (0, 20] (including missing)
     1 = at most 11
     2 = exactly 12
     3 = above 11 and up to 15 (other than exactly 12)
     4 = above 15 */
data pop;
	set individual;

	array parentHgc {2} hgc_bio_mom hgc_bio_dad;
	array parentDeg {2} MomDegree DadDegree;

	do _p = 1 to 2;
		select;
			when (not (0 < parentHgc{_p} <= 20)) parentDeg{_p} = 0;
			when (parentHgc{_p} <= 11)           parentDeg{_p} = 1;
			when (parentHgc{_p} = 12)            parentDeg{_p} = 2;
			when (parentHgc{_p} <= 15)           parentDeg{_p} = 3;
			otherwise                            parentDeg{_p} = 4;
		end;
	end;

	keep id race race_eth samptype gender dob_feasible wt_r1 MomDegree DadDegree;
run;

/* Quick check of the category distributions. */
proc freq data = pop;
	tables momdegree daddegree;
run;

/* Yearly history file: every id-year row of HIST_ED, enriched with arrests,
   behaviour, ASVAB scores (placed on the year they were taken), the
   8th-grade GPA and the household measures. */
data hist;
	merge hist_ed (in=inEd)
		  hist_arrest
		  behavior
		  individual (keep=id asvab_year ASVAB_AR ASVAB_MK ASVAB_WK ASVAB_PC rename=(asvab_year=yr))
		  grade_in8
		  hist_HH;
	by id yr;
	if not inEd then delete;

	retention = (grade_progression = 1);

	/* Negative and missing values both become 0: MAX ignores a missing
	   argument, which matches the way "< 0" is true for a missing value. */
	days_suspended = max(days_suspended, 0);
	num_arrest = max(num_arrest, 0);

	/* Income in thousands; zero or negative amounts are not usable. */
	family_income = family_income/1000;
	if family_income <= 0 then call missing(family_income);

	if grade_attended = 6 then age6th = age;

	/* Any known relationship code other than 1 sets the broken-household flag. */
	if rel_to_HH ne . then brokenHH = (rel_to_HH ne 1);

	/* Top-code the count and hours variables (missing values stay missing). */
	if num_HH_under18 > 5 then num_HH_under18 = 5;
	if READ_hrs_wk > 20 then READ_hrs_wk = 20;
	if HW_hrs_wk > 20 then HW_hrs_wk = 20;

	drop grade_progression rel_to_HH;
run;

/* Per-id averages of family income and household size over ages 10 up to
   (but not including) 17. */
proc summary data = hist;
	by id;
	where (10<=age<17);
	var family_income num_HH_under18;
	output out = meas_fix1 (drop = _freq_ _type_) mean=;
run;

/* Per-id summaries over grades 6-8: whether the household was ever flagged
   as broken, and the youngest age recorded in 6th grade. */
proc summary data = hist;
	by id;
	where (6<=grade_attended<=8);
	var brokenHH age6th;
	output out = meas_fix2 (drop = _freq_ _type_)
		max(brokenHH)= min(age6th)=;
run;


/* Grade 6-8 rows of HIST, minus the variables that are summarised separately
   or not treated as measures.
   NOTE(review): yr is not in the DROP list, so it is transposed below and
   carried along as if it were a measure - confirm this is intended. */
data meas_bygrade;
	set hist;
	where (6<=grade_attended<=8);
	drop hgc hgcprom EdPromotion family_income num_HH_under18 brokenHH age6th;
run;
proc sort data = meas_bygrade; by id grade_attended age; run;
/* Go long: with no VAR statement every numeric variable outside the BY list
   becomes one row (_name_, col1) per id/grade/age. Missing values are dropped. */
proc transpose data = meas_bygrade out = meas_bygrade (where = (col1 ne .));
	by id grade_attended age;
run;
proc sort data = meas_bygrade; by id _name_ grade_attended age; run;
/* For mom_parent_style only the row at the highest age within each grade is
   kept; every other measure keeps all of its rows. */
data meas_bygrade;
	set meas_bygrade;
	by id _name_ grade_attended age;
	if last.grade_attended=0 and _name_="mom_parent_style" then delete;
run;
/* Collapse to a single value per id, measure and grade: the maximum. */
proc means data = meas_bygrade noprint;
	by id _name_ grade_attended ;
	output out = meas_bygrade (drop = _type_ _freq_) max(col1)= meas;
run;

/* Grade-independent summary of the measures. For mom_parent_style only the
   highest available grade is used; the ASVAB scores are left out here. */
data meas_sum;
	set meas_bygrade;
	by id _name_ grade_attended;
	if _name_ = "mom_parent_style" and not last._name_ then delete;
	if _name_ =: "ASVAB" then delete;
run;

/* Maximum across grades for each id and measure. */
proc summary data = meas_sum;
	by id _name_;
	var meas;
	output out = meas_sum (drop = _type_ _freq_) max=;
run;

/* Back to wide form: one row per id, one column per measure. */
proc transpose data = meas_sum out = meas_sum (drop=_name_);
	by id;
	id _name_;
	var meas;
run;


/* Wide form by grade: one row per id and grade, one column per measure. */
proc sort data = meas_bygrade;
	by id grade_attended;
run;
proc transpose data = meas_bygrade out = meas_bygrade2 (drop=_name_);
	by id grade_attended;
	id _name_;
	var meas;
run;

/* Split each skewed count/hours measure into an "any" indicator and the log
   of the positive amount; the raw level is then dropped. Arrests keep the
   indicator plus the raw count for positive values only. */
data meas_bygrade2;
	set meas_bygrade2;

	if days_suspended ne . then do;
		days_suspendedANY = (days_suspended > 0);
		if days_suspendedANY then days_suspendedLOG = log(days_suspended);
	end;

	if READ_hrs_wk ne . then do;
		READ_hrs_wkANY = (READ_hrs_wk > 0);
		if READ_hrs_wkANY then READ_hrs_wkLOG = log(READ_hrs_wk);
	end;

	if HW_hrs_wk ne . then do;
		HW_hrs_wkANY = (HW_hrs_wk > 0);
		if HW_hrs_wkANY then HW_hrs_wkLOG = log(HW_hrs_wk);
	end;

	if num_arrest ne . then num_arrestANY = (num_arrest > 0);
	if num_arrest = 0 then call missing(num_arrest);

	drop days_suspended READ_hrs_wk HW_hrs_wk;
run;

/* Return to long form (one row per id, grade and measure) and drop the
   missing cells. */
proc transpose data = meas_bygrade2 out=meas_bygrade2 (where = (col1 ne .));
	by id grade_attended;
run;
/* Build the column label "<measure>_<grade>" used for the final wide file.
   CATS converts the numeric grade and strips blanks itself, so there is no
   implicit numeric-to-character conversion (and no log NOTE) as with the
   || operator. LAB gets the CATS default length of 200, which does not
   affect its value. */
data meas_bygrade2;
	set meas_bygrade2;
	lab = cats(_name_, "_", grade_attended);
	drop _name_ grade_attended;
run;
proc sort data = meas_bygrade2; by lab; run;

/* Screen the measure-by-grade columns: a column is retained only when it has
   at least 100 non-missing values and its variance is not zero. */
proc summary data = meas_bygrade2;
	by lab;
	var col1;
	output out = meas_INFO (where=(_freq_>=100 and varvar ne 0)) var=varvar;
run;

/* Keep the long rows that belong to a retained column. */
data meas_bygrade3;
	merge meas_INFO (in=isKept keep=lab) meas_bygrade2;
	by lab;
	if not isKept then delete;
run;

/* Final wide layout: one row per id, one column per retained label. */
proc sort data = meas_bygrade3;
	by id;
run;
proc transpose data = meas_bygrade3 out = meas_bygrade3 (drop = _name_);
	by id;
	id lab;
	var col1;
run;


/* Estimation panel: the id-year rows of HIST where hgcprom is known. The
   original hgc is discarded and hgcprom is written out under the name hgc.
   Arrest is a 0/1 flag for any arrest in the year. */
data panl (rename=(hgcprom=hgc));
	set hist (drop=hgc where=(hgcprom ne .));
	by id age;
	Arrest = (num_arrest > 0);
	keep id age hgcprom EdPromotion Arrest yr;
run;

/* Cross-tabulate grade against promotion, missing values included. */
proc freq data = panl;
	tables hgc*Edpromotion / missing;
run;


/* Age at each respondent's first and last panel row, written in one pass:
   AGEHGC9 holds the age on the first row, AGELAST the age on the last. */
data agehgc9 (rename=(age=agehgc9))
     agelast (rename=(age=agelast));
	set panl (keep=id age);
	by id age;
	if first.id then output agehgc9;
	if last.id then output agelast;
run;


/* Measurement file: by-grade measures, the fixed summaries and parental
   schooling, with a flag for membership in the education panel. Family
   income is replaced by a three-way group (1 = missing, 2 = below 15,
   3 = 15 or more) plus the log of income for group 3. */
data measFinal;
	merge meas_bygrade3
	      meas_fix1
	      meas_fix2
	      pop (keep=id MomDegree DadDegree)
	      agehgc9 (in=inPanel keep=id);
	by id;

	InEdSamp = (inPanel = 1);

	select;
		when (family_income = .)  family_income_group = 1;
		when (family_income < 15) family_income_group = 2;
		otherwise                 family_income_group = 3;
	end;
	if family_income_group = 3 then family_incomeLOG = log(family_income);

	drop family_income;
run;

/* Distribution of the income groups in the file just built. */
proc freq data = measFinal;
	tables family_income_group;
run;

/* Population file: weight and demographics only, with negative race codes
   set to missing. */
data popFinal;
	set pop (keep = id wt_r1 race gender race_eth);
	by id;
	if race < 0 then call missing(race);
run;


/* Write the three analysis files; the output format follows from the file
   extension and existing files are overwritten. */
proc export data = popFinal
	outfile = "/folders/myshortcuts/GenderGap/supplemental files/nlsy97/pop.csv"
	replace;
run;

proc export data = measFinal
	outfile = "/folders/myshortcuts/GenderGap/supplemental files/nlsy97/meas.csv"
	replace;
run;

proc export data = panl
	outfile = "/folders/myshortcuts/GenderGap/supplemental files/nlsy97/panel.csv"
	replace;
run;


*** Output for REDUCED FORM **;

/* Schooling at age 24 (whole years): hgc becomes hgc24 and hgcprom becomes
   hgc24USED. */
data ed24 (rename=(hgc=hgc24 hgcprom=hgc24USED));
	set hist (keep=id hgc hgcprom age where=(floor(age)=24));
	drop age;
run;

/* Reduced-form file: one row per id combining demographics, age-24
   schooling, panel entry/exit ages, the summary measures and the age-normed
   ASVAB scores. */
data RFoutput;
	merge pop
	      ed24
	      agehgc9
	      agelast
	      meas_sum
	      meas_fix1
	      meas_fix2
	      individual (keep = id ASVAB_AR_AGE_NORMED ASVAB_MK_AGE_NORMED ASVAB_PC_AGE_NORMED ASVAB_WK_AGE_NORMED);
	by id;
run;

/* Coverage check: non-missing and missing counts for every variable. */
proc means data = RFoutput n nmiss;
run;

proc export data = RFoutput
	outfile = "/folders/myshortcuts/GenderGap/supplemental files/nlsy97/RFdata.dta"
	replace;
run;

*** END Reduced Form **;

/* Running count, within id, of years with at least one arrest. The count is
   accumulated over every row of HIST, but only rows at whole-year ages
   15-24 are written out, together with an ever-arrested-so-far flag. */
data SumArrest;
	set hist (keep=id num_arrest age);
	by id age;

	if first.id then NumArrestYrs = 0;
	NumArrestYrs + (num_arrest > 0);

	age = floor(age);
	if 15 <= age <= 24;

	EverArrest = (NumArrestYrs > 0);
	keep id NumArrestYrs age EverArrest;
run;

/* One row per id with columns Arrest15 ... Arrest24. */
proc transpose data = SumArrest (keep = id age EverArrest)
               out = EverArrest (drop=_name_)
               prefix = Arrest;
	by id;
	id age;
	var EverArrest;
run;

/* Summary file: demographics, age-24 schooling and the arrest-by-age flags. */
data RFoutput2;
	merge RFoutput (keep=id wt_R1 hgc24 gender race race_eth)
	      EverArrest;
	by id;
	if race < 0 then call missing(race);
run;

proc export data = RFoutput2
	outfile = "/folders/myshortcuts/GenderGap/supplemental files/nlsy97/SumData.csv"
	replace;
run;






