/*************************************************************

Comparison of HF database from Chang and Levinson (2020) and the
Survey of Professional Forecasters

Andrew C. Chang, March 18 2022
Board of Governors of the Federal Reserve System, a.christopher.chang@gmail.com
ORCID: 0000-0002-9769-789X

1) Merge the SPF Forecast Data with Dates of Release
2) Prepare SPF data for merging with HF data
3) Merge SPF with HF data
4) Keep SPF forecasts and the closest HF forecast that appears prior to the SPF forecast or on same day as SPF forecast
5) Run MZ/Nordhaus tests on median SPF GDP Forecasts
6) Compute RMSEs between SPF and HF GDP forecasts
7) Graph RMSE Results for GDP (Figure 3, top panel)
8) Compute RMSEs between SPF and HF Inflation forecasts, run MZ/Nordhaus tests on SPF
9) Graph Results for Inflation (Figure 3, bottom panel)

*************************************************************/


* ---------------------------------------------------------------
* Session setup: reset Stata, define the directory globals used by
* every later step, and open the log.
* ---------------------------------------------------------------
clear all
set more off
capture log close

* $Work (the project root) is not defined anywhere in this file, so it has to
* be set by the caller.  Stop with a clear message instead of silently
* building paths relative to the filesystem root.
if `"$Work"' == "" {
	display as error "global Work is not defined; set it to the project root before running this file"
	exit 198
}

* Forward slashes are used throughout: Stata accepts them on every platform,
* and the rest of this file already builds paths such as "$Data/SPF/..." with them.
global Data "$Work/Data"
global DataModified "$Work/Data - Modified/SPF"
global Charts "$Work/Figures"

* Quoted so that a project root containing spaces does not split the filename
log using "${Work}/results/logs/final/SPFandHFcompare.log", replace

version 13.0


*************************
* 1) Merge the SPF Forecast Data with Dates of Release
*************************

* Convert each SPF csv into a Stata dataset so the four files can be merged.
* Columns 3-7 of the two GDP growth files are read as strings; the core PCE
* files are read with import delimited's default column types.
* Each forecast variable gets a suffix (spfmean / spfmedian) naming its source file.

* -- mean real GDP growth
import delimited using "$Data/SPF/Mean_RGDP_Growth.csv", stringcols(3 4 5 6 7) clear
sort year quarter
rename drgdp* drgdp*spfmean
save "$DataModified/Mean_RGDP_Growth.dta", replace

* -- median real GDP growth
import delimited "$Data/SPF/Median_RGDP_Growth.csv", stringcols(3 4 5 6 7) clear
sort quarter year
rename drgdp* drgdp*spfmedian
save "$DataModified/Median_RGDP_Growth.dta", replace

* -- mean core PCE
import delimited using "$Data/SPF/Mean_COREPCE_Level.csv", clear
sort year quarter
rename corepce* corepce*spfmean
save "$DataModified/Mean_COREPCE_Level.dta", replace

* -- median core PCE (stays in memory as the master for the merges below)
import delimited "$Data/SPF/Median_COREPCE_Level.csv", clear
sort quarter year
rename corepce* corepce*spfmedian
save "$DataModified/Median_COREPCE_Level.dta", replace

* Combine the four SPF files into one dataset, one row per survey year and quarter
merge 1:1 year quarter using "$DataModified/Mean_RGDP_Growth.dta", nogenerate
merge 1:1 year quarter using "$DataModified/Median_RGDP_Growth.dta", nogenerate
merge 1:1 year quarter using "$DataModified/Mean_COREPCE_Level.dta", nogenerate

save "$DataModified/SPF_all.dta", replace

* Attach the survey dates
import delimited using "$Data/SPF/SPFDates.csv", clear
replace year = year[_n-1] if year == .  // carry the year down onto rows where the csv leaves it blank
replace quarter = substr(quarter,2,1)  // keep only the second character (drops the leading Q) so quarter can be made numeric
destring quarter, replace

* Only surveys present in both the dates file and the forecast file are kept
merge 1:1 year quarter using "$DataModified/SPF_all.dta"
drop if _merge != 3
drop _merge

*************************
* 2) Prepare SPF data for merging with HF data
*************************

* Replace the text deadline date with a Stata daily date (needed for the merge
* with the HF database); the news release date is dropped.
generate temp = date(truedeadlinedate, "MDY", 2020)
drop truedeadlinedate newsreleasedate
rename temp truedeadlinedate

* Write one file per variable family (inflation, GDP), each keeping the survey
* identifiers and the deadline date
preserve
keep year quarter corepce* truedeadlinedate
save "$DataModified/SPF_COREPCE.dta", replace
restore

keep year quarter drgdp* truedeadlinedate
save "$DataModified/SPF_RGDP.dta", replace

* Reshape to long (matching the format of the HF database).  One intermediate
* dataset is written for each of mean core PCE, median core PCE, mean GDP and
* median GDP.  In each pass:
*   - the other statistic's columns are dropped;
*   - the spfmean/spfmedian suffix is stripped, because reshape needs the number
*     to be the last part of the variable name;
*   - horizon_calendar is shifted by 2.  Per the original author's note (SPF
*     documentation, pg. 21), SPF forecast 1 is the previous quarter and
*     forecast 2 the current quarter, while the HF data uses horizon 0 for
*     the current quarter;
*   - date and variable are created as merge keys.

* -- core PCE inflation
foreach stat in mean median {
	local other = cond("`stat'" == "mean", "median", "mean")
	local STAT = upper("`stat'")

	use "$DataModified/SPF_COREPCE.dta", clear
	drop corepce*spf`other'
	drop corepceaspf`stat' corepcebspf`stat' corepcecspf`stat'  // annual forecasts
	rename corepce*spf`stat' corepce*
	reshape long corepce, i(truedeadlinedate) j(horizon_calendar)
	replace horizon_calendar = horizon_calendar - 2
	rename corepce corepcespf_`stat'
	rename truedeadlinedate date
	generate variable = "Core PCE Inflation"
	save "$DataModified/SPF_COREPCE_`STAT'.dta", replace
}

* -- real GDP growth
foreach stat in mean median {
	local other = cond("`stat'" == "mean", "median", "mean")
	local STAT = upper("`stat'")

	use "$DataModified/SPF_RGDP.dta", clear
	drop drgdp*spf`other'
	rename drgdp*spf`stat' drgdp*
	reshape long drgdp, i(truedeadlinedate) j(horizon_calendar)
	replace horizon_calendar = horizon_calendar - 2
	rename drgdp drgdpspf_`stat'
	rename truedeadlinedate date
	generate variable = "GDP"
	save "$DataModified/SPF_RGDP_`STAT'.dta", replace
}

* Stack GDP and core PCE into one file of medians and one file of means
use "$DataModified/SPF_RGDP_MEDIAN.dta", clear
append using "$DataModified/SPF_COREPCE_MEDIAN.dta"
save "$DataModified/SPF_MEDIAN.dta", replace

use "$DataModified/SPF_COREPCE_MEAN.dta", clear
append using "$DataModified/SPF_RGDP_MEAN.dta"
save "$DataModified/SPF_MEAN.dta", replace
clear

*************************
* 3) Merge SPF with HF data
*************************

import delimited using "$Data/public/hf_final_allobservations_public.csv", clear

* Turn the text date into a Stata daily date so it can be matched to the SPF date
generate temp = date(date, "DMY", 2020)
drop date
rename temp date

* Turn the two text quarters into Stata quarterly values, then measure the
* horizon in calendar quarters (instead of FOMC cycles, per the original note)
foreach qvar in dateqtr projqtr {
	generate temp = quarterly(`qvar', "YQ")
	drop `qvar'
	rename temp `qvar'
}
generate horizon_calendar = projqtr - dateqtr

* Bring in the SPF medians, then the SPF means, on the same four keys
merge 1:1 year date horizon_calendar variable using "$DataModified/SPF_MEDIAN.dta", nogenerate
merge 1:1 year date horizon_calendar variable using "$DataModified/SPF_MEAN.dta", nogenerate
drop quarter

* Rows that arrive with an empty source are labelled as SPF
replace source = "SPF" if source == ""

format date %td
sort variable horizon_calendar date

*************************
* 4) Keep SPF forecasts and the closest HF forecast that appears prior to the SPF forecast or on same day as SPF forecast
*************************

* Horizon 3 is removed: per the original note these are forecasts made after the
* FOMC but before the calendar quarter changes, and only calendar quarters 0-2 are compared
drop if horizon_calendar == 3

* Nothing after November 8 2011 (the original note gives this as the date of the last SPF) is used
drop if date > td(08nov2011)

* Rows without a forecast quarter get it from their date
replace dateqtr = qofd(date) if dateqtr == .

* 2001q3 SPF rows: per the original note there is no HF observation within the
* calendar quarter for horizons >= 1, and the core PCE rows are not needed
drop if source == "SPF" & dateqtr == tq(2001q3) & horizon_calendar >= 1
drop if source == "SPF" & dateqtr == tq(2001q3) & variable == "Core PCE Inflation"

* One HF observation is kept for each SPF observation.  The data are expected to
* be ordered by variable, horizon_calendar and date at this point, because the
* test compares each row with its neighbours.  A row survives if either
*   (a) it shares variable and horizon_calendar with the row before it, and it is
*       a non-SPF row immediately followed by an SPF row, or an SPF row
*       immediately preceded by a non-SPF row; or
*   (b) it carries both an HF forecast and an SPF median for the same variable
*       (an SPF observation matched to an HF observation in the merge).
keep if ///
	(variable == variable[_n-1] & horizon_calendar == horizon_calendar[_n-1] & ///
		((source != "SPF" & source[_n+1] == "SPF") | ///
		 (source == "SPF" & source[_n-1] != "SPF"))) | ///
	(!missing(rgdp_fore) & !missing(drgdpspf_median)) | ///
	(!missing(pcepilfe_fore) & !missing(corepcespf_median))

* Edge case noted by the original author: one GDP observation that the filter above leaves in
drop if horizon_calendar == -1 & variable == "GDP"

format dateqtr %tq
order date horizon_calendar rgdp_fore drgdpspf*

save "$DataModified/HF_SPF_MERGED.dta", replace

*************************
* 5) Run MZ/Nordhaus tests on median SPF GDP Forecasts
* Works on the merged SPF/HF sample currently in memory (per the original note,
* the same sample as the SPF/HF comparison, i.e. missing 2001Q3).
* Everything done here is undone by the closing restore.
*************************

preserve

keep if variable == "GDP"
destring drgdpspf_median, replace

* SPF rows with no realized value take it from the row directly above
* (per the original comment, this is the corresponding actual BEA value)
replace rgdp_3rd = rgdp_3rd[_n-1] if missing(rgdp_3rd) & source == "SPF"

* Forecast error of the SPF median
generate rgdpspf_err_median = drgdpspf_median - rgdp_3rd

* Keep SPF rows plus HF rows that also carry an SPF median.  Only GDP rows remain
* at this point, so no core PCE condition is needed here.
keep if source == "SPF" | (~missing(rgdp_fore) & ~missing(drgdpspf_median))

* One panel per horizon.  Every remaining row is GDP, so the panel id is simply
* horizon_calendar + 4.
generate panelvar = horizon_calendar + 4

xtset panelvar dateqtr

* Change in the SPF median from the previous quarter within the same horizon panel.
* NOTE(review): this differences forecasts made at the same horizon in consecutive
* quarters, not two forecasts of the same target quarter -- confirm this is the
* intended definition of the revision.
generate rgdpspf_rev = d.drgdpspf_median

* Regress the median forecast error on that change, separately for each horizon
bysort horizon_calendar: regress rgdpspf_err_median rgdpspf_rev

restore

*************************
* 6) Compute RMSEs between SPF and HF GDP forecasts
*************************

preserve

keep if variable == "GDP"
destring drgdpspf_mean drgdpspf_median, replace

* SPF rows with no realized value take it from the row directly above
* (per the original comment, this is the corresponding actual BEA value)
replace rgdp_3rd = rgdp_3rd[_n-1] if source == "SPF" & missing(rgdp_3rd)

* For the SPF mean and the SPF median: forecast error, then mean squared error
* and its square root within each horizon
foreach stat in mean median {
	generate rgdpspf_err_`stat' = drgdpspf_`stat' - rgdp_3rd
	by horizon_calendar, sort: egen mse_spf`stat' = mean(rgdpspf_err_`stat'^2)
	generate rmse_spf`stat' = sqrt(mse_spf`stat')
}

* Same for the HF forecasts, whose error (rgdp_err) is already in the data
by horizon_calendar, sort: egen mse_hf = mean(rgdp_err^2)
generate rmse_hf = sqrt(mse_hf)

* Reduce to one row of RMSEs per horizon (each RMSE is constant within a horizon)
collapse (mean) rmse_spfmean rmse_spfmedian rmse_hf, by(horizon_calendar)
sort horizon_calendar

*************************
* 7) Graph RMSE Results for GDP
*************************

* Flip the sign so the x axis reads as quarters to the target quarter
replace horizon_calendar = -horizon_calendar

twoway scatter rmse_spfmean rmse_spfmedian rmse_hf horizon_calendar, ///
	msymbol(square circle triangle) mcolor(blue blue green) msize(small small small) ///
	xtitle("Quarters to Target", size(large)) ///
	ytitle("Real GDP," "RMSE," "p.p., a.r.", orientation(horizontal) justification(left) size(large)) ///
	ylabel(0 0.5 1 1.5 2 2.5, angle(0) nogextend nogrid labsize(large)) ///
	xlabel(-2 -1 0, labsize(large)) ///
	plotregion(fcolor(white)) graphregion(fcolor(white)) ///
	legend(off) ///
	text(2.5 -2 "SPF (Circles and Squares)", place(e) color(blue) nobox size(large)) ///
	text(2.1 -2 "High-Frequency (Triangles)", place(e) color(green) nobox size(large))

graph export "$Charts/SPFandHF_RGDPCompare.pdf", replace fontface(Times)
restore

*************************
* 8) Compute RMSEs between SPF and HF Inflation forecasts
* (no regressions are run in this step)
*************************

preserve
keep if variable == "Core PCE Inflation"
drop if year < 2007  // per the original note, the SPF has no core PCE inflation forecasts before 2007
drop if horizon_calendar == -1  // per the original note, backcasts aren't meaningful in the SPF
destring corepcespf_mean corepcespf_median, replace

* SPF rows with no realized value take it from the row directly above
* (per the original comment, this is the corresponding actual BEA value)
replace pcepilfe_3rd = pcepilfe_3rd[_n-1] if source == "SPF" & missing(pcepilfe_3rd)

* For the SPF mean and the SPF median: forecast error, then mean squared error
* and its square root within each horizon
foreach stat in mean median {
	generate corepcespf_err_`stat' = corepcespf_`stat' - pcepilfe_3rd
	by horizon_calendar, sort: egen mse_spf`stat' = mean(corepcespf_err_`stat'^2)
	generate rmse_spf`stat' = sqrt(mse_spf`stat')
}

* Same for the HF forecasts, whose error (pcepilfe_err) is already in the data
by horizon_calendar, sort: egen mse_hf = mean(pcepilfe_err^2)
generate rmse_hf = sqrt(mse_hf)

* Reduce to one row of RMSEs per horizon (each RMSE is constant within a horizon)
collapse (mean) rmse_spfmean rmse_spfmedian rmse_hf, by(horizon_calendar)
sort horizon_calendar

*************************
* 9) Graph Results for Inflation
*************************

* Flip the sign so the x axis reads as quarters to the target quarter
replace horizon_calendar = -horizon_calendar

twoway scatter rmse_spfmean rmse_spfmedian rmse_hf horizon_calendar, ///
	msymbol(square circle triangle) mcolor(blue blue green) msize(small small small) ///
	xtitle("Quarters to Target", size(large)) ///
	ytitle("Core PCE" "Inflation," "RMSE," "p.p., a.r.", orientation(horizontal) justification(left) size(large)) ///
	ylabel(0 0.5 1, angle(0) nogextend nogrid labsize(large)) ///
	xlabel(-2 -1 0, labsize(large)) ///
	plotregion(fcolor(white)) graphregion(fcolor(white)) ///
	legend(off) ///
	text(0.5 -2 "SPF (Circles and Squares)", place(e) color(blue) nobox size(large)) ///
	text(0.7 -2 "High-Frequency (Triangles)", place(e) color(green) nobox size(large))

graph export "$Charts/SPFandHF_CorePCECompare.pdf", replace fontface(Times)
restore


log close

