/* 
date:    20190114
author:  Simon Kwok
purpose: to clean and prepare data for bubble inference

For details, please refer to: Jarrow, Robert, and Simon Kwok (2021) "Inferring Financial Bubbles from Option Data", Journal of Applied Econometrics, forthcoming.  

The following datasets are needed. All datasets are stored in the folder "option".

Input dataset 1: callbase.sas7bdat 
 source:           OptionMetrics in WRDS
 fields:
  date             date of transaction (e.g., 04JAN1996)
  exdate           expiry date (e.g., 04JAN1996)
  cp_flag          call/put indicator (C for call; P for put)
  strike_price     strike price multiplied by 1000 (divided by 1000 below to obtain the strike)
  best_bid         best bid price
  best_offer       best ask price
  volume           traded volume
  open_interest    open interest
  impl_volatility  implied volatility
  delta            option delta

Input dataset 2: snp500.sas7bdat
 source:           WRDS
 fields:
  date             date
  snp500           S&P 500 index

Input dataset 3: div_shiller.sas7bdat
 source:           http://www.econ.yale.edu/~shiller/data.htm
 fields:
  year             year
  month            month
  dividend         S&P dividend (divided by the index level below to obtain the dividend yield)

Input dataset 4: trate_1mo3mo.sas7bdat
 source:           WRDS
 fields:
  date             date
  trate1mo         1-month Treasury bill rate
  trate3mo         3-month Treasury bill rate
*/

/* Sample window as plain text; these two values are also embedded in the
   names of the exported CSV files. */
%let stdate=01Jan2000;
%let endate=31Dec2000;

/* The same window expressed as SAS date literals for use in WHERE conditions. */
%let startdate="&stdate"d;
%let enddate="&endate"d;

/* Columns kept when reading the raw option file. */
%let var_call=date exdate cp_flag strike_price best_bid best_offer volume open_interest impl_volatility delta;

/* Read the option quotes inside the sample window and derive:
     callprice  mid-point of best bid and best offer
     strike     strike_price rescaled by 1/1000
     tau        number of days between the quote date and the expiry date */
data calldata;
    set option.callbase (keep=&var_call);
    where &startdate <= date <= &enddate;
    callprice = (best_offer + best_bid) / 2;
    strike    = strike_price / 1000;
    tau       = exdate - date;
run;

/* Order the quotes by quote date, expiry and strike. */
proc sort data=calldata;
    by date exdate strike;
run;

/* Cleaning call data. A quote is kept only when all of the following hold:
     - the expiry falls before the 27th of the month, on a Friday or a Saturday
       (WEEKDAY() returns 6 for Friday and 7 for Saturday; all expiry dates of
       standard options fall on Fri starting from 11 Dec 2015);
     - the bid or the offer is at least 0.05, and the spread is non-negative;
     - there are at least 9 days to expiry and some traded volume;
     - the implied volatility is non-missing and positive. */
data cleancalldata0;
    set calldata;
    where day(exdate) < 27
      and weekday(exdate) in (6, 7)
      and (best_bid >= 0.05 or best_offer >= 0.05)
      and (best_offer - best_bid) >= 0
      and tau >= 9
      and volume > 0
      and impl_volatility ne .
      and impl_volatility > 0;
run;

/* Retain only the (date, cp_flag, tau) groups that contain more than 3 quotes.
   The group size is merged back onto every retained row as column n. */
proc sql;
    create table cleancalldata as
    select
        *,
        count(date) as n
    from cleancalldata0
    group by
        date,
        cp_flag,
        tau
    having n > 3;
quit;

/* Working copy of the dividend input. */
data divraw;
    set option.div_shiller;
run;

/* Working copy of the index input. */
data indexraw;
    set option.snp500;
run;

/* Add snp21ma, a 21-observation moving average of snp, to indexraw.
   This runs on the full series, before the sample window is applied.
   NOTE(review): the file header lists the index field as snp500, whereas
   this step reads a variable called snp -- confirm the actual column name. */
proc expand data=indexraw out=indexraw method=none;
    convert snp=snp21ma / transform=(movave 21);
run;

/* Restrict the index series to the sample window and tag each row with its
   calendar year and month. */
data indexdata;
    set indexraw (where=(&startdate <= date <= &enddate));
    yr = year(date);
    mo = month(date);
run;

/* Attach snpmonth, the mean of snp within each (yr, mo) group, to every
   daily row of that group (PROC SQL remerges the summary with the detail rows). */
proc sql;
    create table indexdata2 as
    select
        date,
        yr,
        mo,
        snp,
        snp21ma,
        mean(snp) as snpmonth
    from indexdata
    group by
        yr,
        mo;
quit;

/* Build tr, the interest rate used downstream. Preference order: the current
   1-month rate, its first and second lags, then the current 3-month rate and
   its first and second lags. If all six are missing, tr stays missing. */
data trate;
    set option.trate_1mo3mo;

    /* The LAG calls run on every row, outside any conditional branch. */
    t1  = lag(trate1mo);
    t2  = lag2(trate1mo);
    t31 = lag(trate3mo);
    t32 = lag2(trate3mo);

    select;
        when (trate1mo ne .) tr = trate1mo;
        when (t1 ne .)       tr = t1;
        when (t2 ne .)       tr = t2;
        when (trate3mo ne .) tr = trate3mo;
        when (t31 ne .)      tr = t31;
        when (t32 ne .)      tr = t32;
        otherwise;
    end;

    /* Rescale by 1/100 (presumably percent to decimal -- confirm the input units). */
    tr = tr / 100;
run;


/* Combine the index series with the interest rate, keeping only the dates
   that are present in both tables. */
proc sql;
    create table index_tr as
    select
        coalesce(idx.date, rate.date) as date,
        mo,
        yr,
        snp,
        snp21ma,
        snpmonth,
        tr
    from indexdata2 as idx
    inner join trate as rate
        on idx.date = rate.date;
quit;


/* Attach the dividend series to every daily row by matching on calendar
   year and month; days without a matching dividend row are kept. */
proc sql;
    create table index_tr_div as
    select
        ix.date as date,
        ix.snp as snp,
        snp21ma,
        snpmonth,
        tr,
        dividend
    from index_tr as ix
    left join divraw as dv
        on ix.yr = dv.year
       and ix.mo = dv.month;
quit;

/* Two dividend-yield variants:
     dy_ma   dividend over the 21-observation moving average of the index
     dy_fix  dividend over the within-month mean of the index */
data index_tr_div;
    set index_tr_div;
    dy_ma  = dividend / snp21ma;
    dy_fix = dividend / snpmonth;
run;


/* Match each cleaned option quote with the same-day index level, dividend
   yields and interest rate. The join is a full join, but the tau filter
   removes rows that carry no option quote. */
proc sql;
    create table indexopt as
    select
        coalesce(opt.date, ix.date) as date,
        snp,
        dy_ma,
        dy_fix,
        tr,
        cp_flag,
        strike,
        callprice,
        exdate,
        tau,
        volume,
        open_interest,
        best_bid,
        best_offer,
        impl_volatility as iv,
        delta
    from cleancalldata as opt
    full join index_tr_div as ix
        on opt.date = ix.date
    where tau ne .;
quit;

/* Moneyness measures:
     money   log of the strike, discounted at (tr - dy_ma) over tau/252, relative to the index
     money2  plain strike-to-index ratio
   NOTE(review): tau is a difference of two dates (calendar days) but is
   annualised with 252 here -- confirm that this day-count is intended. */
data indexopt;
    set indexopt;
    informat date YYMMDD8.;
    format date DATE9.;
    money  = log(strike*exp(-(tr-dy_ma)*tau/252)/snp);
    money2 = strike/snp;
run;

/* Order the quotes so that the BY-group processing in PROC RANK is valid. */
proc sort data=indexopt;
    by date cp_flag exdate strike;
run;

/* Within each (date, cp_flag) group, rank the quotes by tau; tied tau values
   share a rank and ranks are consecutive (ties=dense). Result: taurank0. */
proc rank data=indexopt out=indexopt2 ties=dense;
    by date cp_flag;
    var tau;
    ranks taurank0;
run;

/* Prefix used in the names of the exported CSV files. */
%let datatype=allopt;

/* Final option panel: the columns that are exported, ordered by quote date,
   option type, days to expiry and strike. */
proc sql;
    create table indexopt3 as
    select
        date,
        cp_flag,
        exdate,
        tau,
        strike,
        snp,
        dy_ma,
        tr,
        money,
        callprice,
        volume,
        iv,
        delta
    from indexopt2
    where tau ne .
    order by
        date,
        cp_flag,
        tau,
        strike;
quit;

/* Number of option quotes per quote date.
   The count is given the explicit name n; without an alias PROC SQL assigns a
   system-generated column name, which would then appear as the header of the
   exported count file. */
proc sql;
    create table optcount as
    select
        date,
        count(date) as n
    from indexopt3
    group by date;
quit;

/* Number of option quotes per quote date and option type (cp_flag).
   The count is given the explicit name n so that the column does not receive
   a system-generated name. */
proc sql;
    create table optcpcount as
    select
        date,
        cp_flag,
        count(date) as n
    from indexopt3
    group by
        date,
        cp_flag;
quit;

/* Per (date, cp_flag, tau) group: the number of quotes and the range of the
   moneyness measure money. */
proc sql;
    create table optcpstats as
    select
        date,
        cp_flag,
        tau,
        count(date) as n,
        min(money)  as minm,
        max(money)  as maxm
    from indexopt3
    group by
        date,
        cp_flag,
        tau;
quit;

/* Lowest quoted price (callprice) within each (cp_flag, date, tau) group. */
proc sql;
    create table optcpotm as
    select
        cp_flag,
        date,
        tau,
        min(callprice) as minprice
    from indexopt3
    group by
        cp_flag,
        date,
        tau;
quit;

/* For each (date, cp_flag): the shortest time to expiry (mintau) together
   with the lowest price observed at that expiry (minprice).
   Selecting minprice without aggregating it makes PROC SQL remerge the
   summary with the detail rows, which returns one row per tau, each labelled
   with the group-wide mintau. The HAVING condition keeps only the row whose
   tau equals the group minimum, so every group yields a single row.
   NOTE(review): this assumes the table is meant to hold one row per
   (date, cp_flag) -- confirm against the intended use of cpotm. */
proc sql;
    create table cpotm as
    select
        date,
        cp_flag,
        tau as mintau,
        minprice
    from optcpotm
    group by
        date,
        cp_flag
    having tau = min(tau);
quit;

/* Daily moneyness summary for calls (cp_flag = C) with tau of at most 90:
   quote count, mean, minimum and maximum of money. */
proc sql;
    create table cmon as
    select
        date,
        count(date) as n,
        mean(money) as avgm,
        min(money)  as minm,
        max(money)  as maxm
    from indexopt3
    where cp_flag='C'
      and tau<=90
    group by date;
quit;

/* The same daily summary for puts (cp_flag = P) with tau of at most 90. */
proc sql;
    create table pmon as
    select
        date,
        count(date) as n,
        mean(money) as avgm,
        min(money)  as minm,
        max(money)  as maxm
    from indexopt3
    where cp_flag='P'
      and tau<=90
    group by date;
quit;

/* Write the results as CSV files, overwriting any existing file. The file
   names combine the datatype prefix with the sample start and end dates. */

/* Final option panel. */
proc export
    data=indexopt3
    outfile="&datatype._&stdate.to&endate..csv"
    dbms=csv
    replace;
run;

/* Quote counts per date. */
proc export
    data=optcount
    outfile="&datatype._&stdate.to&endate._count.csv"
    dbms=csv
    replace;
run;

/* Quote counts and moneyness range per (date, cp_flag, tau). */
proc export
    data=optcpstats
    outfile="&datatype._&stdate.to&endate._stats.csv"
    dbms=csv
    replace;
run;
