
* This do-file prepares datasets for benchmarking industries

* Project root: every dataset path in this do-file is built from $path.
global path "."									/*path on your computer*/
cd "$path"
* Forward slashes are accepted by Stata on Windows, macOS and Unix alike, and
* they cannot be mistaken for the backslash that escapes macro expansion.
use "$path/_datasets/TFP_china_tech", clear			/*path to the cleaned panel dataset 1998-2007*/

*-------------------------------------------------------------------------------
* define high-tech industries based on cic classification
*-------------------------------------------------------------------------------
* Shorten the industry-code names and build numeric 3- and 4-digit codes.
rename cic_2_digit cic_2d
rename cic_adj cic_4d
* substr() requires cic_4d to still be a string at this point.
gen cic_3d = substr(cic_4d,1,3)
* force: codes containing non-numeric characters become missing instead of
* stopping the do-file.
destring cic_3d cic_4d, replace force

* 2 digit group includes: 25,26,27,36,37,40,41
keep if inlist(cic_2d,25,26,27,36,37,40,41)

* 3 or 4 digit group includes
* Each local holds a logical expression (as text) selecting the codes of one
* 2- or 3-digit group; the pieces are OR-ed together into `tech' below.
local ind_25 "cic_3d==253"
local ind_26 "cic_4d==2665"
local ind_27 "inlist(cic_4d,2710,2720,2730,2740,2750,2760,2770)"

local ind_36 "inlist(cic_4d,3681,3682,3683,3684,3685,3686,3689)"
local ind_37 "inlist(cic_4d,3761,3762,3769)"

local ind_401 "inlist(cic_4d,4011,4012,4013,4014,4019)"
local ind_402 "cic_3d==402"
local ind_403 "inlist(cic_4d,4031,4032,4039)"
local ind_404 "inlist(cic_4d,4041,4042,4043)"
local ind_405 "inlist(cic_4d,4051,4052,4053,4059)"
local ind_406 "inlist(cic_4d,4061,4062)"
local ind_407 "inlist(cic_4d,4071,4072)"
local ind_409 "cic_3d==409"
local ind_40 "`ind_401'|`ind_402'|`ind_403'|`ind_404'|`ind_405'|`ind_406'|`ind_407'|`ind_409'"

local ind_411 "inlist(cic_4d,4111,4112,4113,4114,4115,4119)"
* Note: this group also carries the codes 4141 and 4190.
local ind_412 "inlist(cic_4d,4121,4122,4123,4124,4125,4126,4127,4128,4129,4141,4190)"
local ind_415 "inlist(cic_4d,4154,4155)"
local ind_41 "`ind_411'|`ind_412'|`ind_415'"

* Full selection rule; echoed so the log records exactly which filter was applied.
local tech "`ind_25'|`ind_26'|`ind_27'|`ind_36'|`ind_37'|`ind_40'|`ind_41'"
di "(`tech')"						
keep if `tech'

* Rename all main variables for better notations
* tab, gen() creates one dummy per distinct value of Ownership, named
* Ownership1, Ownership2, ... in sorted order of the values.
tab Ownership, gen(Ownership)
rename lnInputShare sm
rename Ownership4 v4
rename Ownership5 v5

* The dummies are numbered by rank, not by value: if an ownership category is
* absent from the sample, Ownership4/Ownership5 would silently refer to other
* categories. Stop here rather than carry mislabelled indicators forward.
assert v4 == (Ownership==4) if !missing(Ownership)
assert v5 == (Ownership==5) if !missing(Ownership)

* type: 1 if Ownership<=3, 2 if Ownership==4, 3 if Ownership==5, missing otherwise
* (a missing Ownership fails every test below and therefore stays missing).
gen type = cond(Ownership<=3, 1, cond(Ownership==4, 2, cond(Ownership==5, 3, .)))

*drop k l m
* Short names for the log variables used from here on.
rename lnrY yg
rename lnrK k
rename lnL l
rename lnrM m
		
* Switching indicators
* d_<group> = 1 in an observation where the firm has the group's Ownership code
*             (5 for OECD, 4 for HKMT) and had Ownership<=3 in its previous
*             observation; the firm's first observation is always 0.
* ex_<group> = 1 in the observation immediately before such a switch.
* NOTE(review): "previous" means the previous row within NewID sorted by year,
* which is not necessarily the previous calendar year if the panel has gaps.
local own_OECD 5
local own_HKMT 4
foreach src in OECD HKMT {
	bysort NewID (year): gen d_`src' = (_n>1 & Ownership[_n-1]<=3 & Ownership==`own_`src'')
}
foreach src in OECD HKMT {
	bysort NewID (year): gen ex_`src' = (d_`src'==0 & d_`src'[_n+1]==1)
}

* Pooled versions: any switch, either of the two ownership dummies, and the
* observation before any switch.
gen d = inlist(1, d_HKMT, d_OECD)
gen v = inlist(1, v4, v5)
bysort NewID (year): gen ex = (d==0 & d[_n+1]==1)

* Firm-level counts of switches, followed by codebook output on the firms
* with at least one / at least two switches.
egen treat = total(d), by(NewID)
codebook NewID if treat>=1
codebook NewID if treat>=2

egen treatHKMT = total(d_HKMT), by(NewID)
codebook NewID if treatHKMT>=1
codebook NewID if treatHKMT>=2 & d_HKMT==1

egen treatOECD = total(d_OECD), by(NewID)
codebook NewID if treatOECD>=1
codebook NewID if treatOECD>=2 & d_OECD==1

********************************************************************************
* CLEANING CODE 1

* Drop observations with non-positive or missing inputs/outputs.
* missing() is used instead of ==. so that extended missing values (.a-.z),
* which are not equal to . and are larger than any number, are removed too.
drop if K <= 0 | missing(K) /*dropped 5491 observations*/
drop if rK <= 0 | missing(rK) /*dropped 3759 observations*/
drop if rVA <= -0.01 | missing(rVA) /*dropped 29887 observations*/
drop if L <= 8 | missing(L) /*dropped 6464 observations*/
drop if M <= 0 | missing(M) /*dropped 523 observations*/

* Input share must lie strictly between 0 and 1.
* Stata treats missing as larger than any number, so >=1 also removes
* observations with a missing InputShare.
drop if InputShare>=1 /*dropped 7270 observations*/
drop if InputShare<=0 /*dropped 0 observations*/

* Ownership shares outside the admissible range.
* As above, the > 1 test also removes observations with a missing OECDShare.
* NOTE(review): there is no matching "HKMTShare > 1" check - confirm whether
* that omission is intentional.
drop if OECDShare < 0 /*dropped 4 observations*/
drop if HKMTShare < 0 /*dropped 5 observations*/
drop if OECDShare > 1 /*dropped 2 observations*/

********************************************************************************
* CLEANING CODE 2

* Drop outliers and SOE firms
	* if firms switch twice
	* (treat is the firm-level switch count built before any cleaning)
	drop if treat>=2 			/*dropped 1177 obs*/
	
	* if outliers 1% and 99% of log material share
	* The summary is displayed for reference only; the cut-offs below are
	* hard-coded (presumably read off this output - confirm if the sample changes).
	summarize sm, detail
	* Keep the open interval (-4, -0.05); a missing sm fails the second test
	* and is dropped as well.
	keep if sm>-4 & sm<-0.05	/*dropped 20041 obs*/
	
	* if SOE firms
	drop if Ownership==1		/*dropped 92045 obs*/
	
	* if log employment is too large 99%
	* (a missing l compares as larger than 10 and is dropped too)
	drop if l>=10				/*dropped 45 obs*/
	
* Forward slashes keep the path valid on every platform Stata runs on.
save "$path/_datasets/TFP_china_tech_cic.dta", replace
* 
* Factor-share diagnostics. These variables are created after the dataset has
* been saved, so they exist only in memory.

* Remove leftovers one variable at a time: a single -capture drop- with a list
* fails as a whole (dropping nothing) when any listed variable is absent.
* share_relative is included because it is generated below.
foreach sharevar in laborshare materialshare capshare share_ratio share_relative {
	capture drop `sharevar'
}
gen laborshare = tot_wage/Y
gen materialshare = InputShare
gen share_relative = laborshare/materialshare


* A named log can be opened while another log is active, and can be closed
* without touching it; the capture handles a log left open by an aborted run.
capture log close share_check
log using share_check, replace name(share_check)
* Yearly share statistics, first for observations with v==1, then for v==0.
tabstat laborshare materialshare share_relative if v==1, by(year)
tabstat laborshare materialshare share_relative if v==0, by(year)
log close share_check
