function createPWT90MatlabDataNoPetro
%{
This function reads in an excel file with data from Penn World Tables 9.0
The file is organized as 47 columns

countrycode	country	currency_unit	year	rgdpe	rgdpo	pop	emp	avh
hc	ccon	cda	cgdpe	cgdpo	ck	ctfp	cwtfp	rgdpna	rconna	rdana
rkna	rtfpna	rwtfpna	labsh	delta	xr	pl_con	pl_da	pl_gdpo	i_cig
i_xm	i_xr	i_outlier	cor_exp	statcap	csh_c	csh_i	csh_g	csh_x
csh_m	csh_r	pl_c	pl_i	pl_g	pl_x	pl_m	pl_k


A new Matlab data set is then created for easier processing.

Inputs:  none. Reads sheet 'Data' of 'pwt90.xlsx' (resolved by xlsread
         relative to the current folder / Matlab path).
Outputs: none returned. Every variable left in the function workspace at
         the end is written to the MAT-file 'PWT90Data' by the save
         command, so the local variable names below are effectively the
         schema of that file; rename or add locals with care.

Processing steps, in order:
  1. locate the rgdpna, pop, year, countrycode and country columns
  2. split the USA rows out into USAgdpna/USApop/USAyear and yStar
  3. drop the rows of the countries in countryCodesToEliminate
  4. number the remaining countries consecutively (crossId)
  5. compute per-capita output y and gap = log(y./yStar) per country
  6. record first/last valid observation and year for each country
  7. compute last-observation population and GDP summary values
  8. save the workspace and display the elapsed time

Errors are raised if a required header or the USA rows are missing, if the
USA block does not have 65 rows, if the remaining rows are not exactly 65
per country, or if a country has no valid observations.

Dick Startz
September 2016
modified May 2017
%}
tic;
[numericData,txt] = xlsread('pwt90.xlsx','Data');
%{
    numerical data shows up in numericData
    countryCode and country show up in
    txt, columns 1 and 2
    note that txt includes an extra
    header row
    
    Note that USA is included and needs to be separated out
    So we are going to renumber the cross IDs to keep them consecutive
    they will not match EViews file
%}

headers = txt(1,:);
nCols = length(headers);
% start the column indexes at 0 so a missing header can be detected below
% instead of surfacing later as an undefined-variable error
gdpnaCol = 0;
popCol = 0;
yearCol = 0;
countrycodeCol = 0;
countryCol = 0;
for iCol= 1:nCols
    if strcmp(headers{iCol},'rgdpna')
        gdpnaCol = iCol;
    end
    if strcmp(headers{iCol},'pop')
        popCol = iCol;
    end
    if strcmp(headers{iCol},'year')
        yearCol = iCol;
    end
    if strcmp(headers{iCol},'countrycode')
        countrycodeCol = iCol;
    end
    if strcmp(headers{iCol},'country')
        countryCol = iCol;
    end
end
if any([gdpnaCol,popCol,yearCol,countrycodeCol,countryCol] == 0)
    error(['Header row is missing one of: rgdpna, pop, year, ',...
        'countrycode, country']);
end

% now take out top row of txt so it matches with num
txt = txt(2:end,:);
% now take out USA data
% (the USA rows are assumed to be contiguous: everything from the first
% to the last matching row is treated as USA)
firstUSARow = 0;
for iRow = 1:size(txt,1)
    if strcmp(txt{iRow,countrycodeCol},'USA')
        if firstUSARow == 0
            firstUSARow = iRow;
        end
        lastUSARow = iRow;
    end
end
if firstUSARow == 0
    error('No rows with countrycode USA were found');
end

% remember that txt has columns in front of numeric data
% (numericData is narrower than txt; the width difference is used as the
% offset that converts a txt column index into a numericData column index)
gdpnaCol = gdpnaCol -(size(txt,2) - size(numericData,2));
popCol = popCol -(size(txt,2) - size(numericData,2));
yearCol = yearCol -(size(txt,2) - size(numericData,2));

USAgdpna = numericData(firstUSARow:lastUSARow,gdpnaCol);
USApop =  numericData(firstUSARow:lastUSARow,popCol);
USAyear =  numericData(firstUSARow:lastUSARow,yearCol);

% yStar is USA rgdpna per capita; it is the denominator of gap below
USAperCapitaNA = USAgdpna./USApop;
yStar = USAperCapitaNA;
numericData = numericData([1:firstUSARow-1,lastUSARow+1:end],:);
txt = txt([1:firstUSARow-1,lastUSARow+1:end],:);

% now take out Qatar, Macao, Brunei, UAE, and Kuwait.
% also countries that begin in 2005 (the CUW and SXM codes below)
countryCodesToEliminate = {'ARE', 'BRN', 'KWT', 'MAC', 'QAT',...
    'CUW','SXM'};
for iCountry = 1:length(countryCodesToEliminate)
    firstRowToEliminate = 0;
    for iRow = 1:size(txt,1)
        if strcmp(txt{iRow,countrycodeCol},...
                countryCodesToEliminate{iCountry})
            if firstRowToEliminate == 0
                firstRowToEliminate = iRow;
            end
            lastRowToEliminate = iRow;
        end
    end
    if firstRowToEliminate == 0
        % Code not present in the data. Without this guard the stale
        % lastRowToEliminate from the previous country would be reused
        % and rows 1:lastRowToEliminate of other countries deleted.
        warning('Country code %s not found in data; nothing eliminated.',...
            countryCodesToEliminate{iCountry});
        continue;
    end
    disp(['Eliminating ',countryCodesToEliminate{iCountry},...
        ' with last population = ',...
        num2str(numericData(lastRowToEliminate,popCol))]);
    numericData = numericData([1:firstRowToEliminate-1,...
        lastRowToEliminate+1:end],:);
    txt = txt([1:firstRowToEliminate-1,lastRowToEliminate+1:end],:);
end


originalNobs = size(numericData,1);

% 'stable' keeps the codes/names in order of first appearance, so position
% in these lists lines up with the consecutive crossId assigned below
% (as long as each country's rows are contiguous)
countryCodesCell = unique(txt(:,countrycodeCol),'stable');
countryCodesString = char(countryCodesCell);
countryNames = unique(txt(:,countryCol),'stable');
nCountries = length(countryCodesCell);

% create consecutive crossId, 1 for each country
% and create indexes for observations

firstObForCountry = nan(nCountries,1);
lastObForCountry = nan(nCountries,1);
firstYearForCountry = nan(nCountries,1);
lastYearForCountry = nan(nCountries,1);

crossId = nan(originalNobs,1);
crossId(1) = 1;
% provisional values for country 1; all countries (including the first)
% are overwritten in the valid-observation loop further down
firstObForCountry(1) = 1;
firstYearForCountry(1) = numericData(1,yearCol);

% bump the counter every time the country code changes from one row to
% the next
countryCounter = 1;
for iRow = 2:originalNobs
    if strcmp(txt(iRow,countrycodeCol),txt(iRow-1,countrycodeCol))
        crossId(iRow) = countryCounter;
    else
        countryCounter = countryCounter + 1;
        crossId(iRow) = countryCounter;
    end
end

% find the crossId for Korea, Spain, China, India, and Canada
% (a variable is only created if the corresponding code is present)
for iCountry = 1:nCountries
    if strcmp(countryCodesCell{iCountry},'KOR')
        koreaCountryNumber = iCountry;
    elseif strcmp(countryCodesCell{iCountry},'ESP')
        spainCountryNumber = iCountry;
    elseif strcmp(countryCodesCell{iCountry},'CHN')
        chinaCountryNumber = iCountry;
    elseif strcmp(countryCodesCell{iCountry},'IND')
        indiaCountryNumber = iCountry;
    elseif strcmp(countryCodesCell{iCountry},'CAN')
        canadaCountryNumber = iCountry;
    end
end

rgdpna = numericData(:,gdpnaCol);
pop = numericData(:,popCol);
yearOb = numericData(:,yearCol);

% note that we have 65 observations per country, although many are missing.
nobsPerCountry = size(USAgdpna,1);
if nobsPerCountry ~= 65
    error('Wrong number of observations per country');
end
% The gap loop below slices y in fixed blocks of nobsPerCountry rows and
% divides each block element-wise by yStar, so every remaining country
% must occupy exactly nobsPerCountry consecutive rows.
if originalNobs ~= nCountries*nobsPerCountry
    error(['Data are not a balanced panel: expected ',...
        num2str(nCountries*nobsPerCountry),' rows but found ',...
        num2str(originalNobs)]);
end

y = rgdpna./pop;

% gap is the log of each country's per-capita output relative to the USA
% value in the same row position of the country's block
gap = nan(originalNobs,1);
for iCountry = 1:nCountries
    gap(1+nobsPerCountry*(iCountry-1):nobsPerCountry*iCountry)...
        = log(y(1+nobsPerCountry*(iCountry-1):...
        nobsPerCountry*iCountry)./yStar);
end

% now save first and last data for each country

nobs = 0;
for iCountry = 1:nCountries
    % first find observations for the country
    % then limit them to the ones which have a valid gap variable
    validObs = (iCountry == crossId) & ~isnan(rgdpna)...
        & ~isnan(yearOb) & ~isnan(pop);
    if ~any(validObs)
        error('No valid observations for country %s',...
            countryCodesCell{iCountry});
    end
    nobs = nobs + sum(validObs);
    
    % note that we are going to assume no internal missing data
    firstObForCountry(iCountry) = find(validObs,1,'first');
    lastObForCountry(iCountry) = find(validObs,1,'last');
    
    firstYearForCountry(iCountry) = yearOb(firstObForCountry(iCountry));
    lastYearForCountry(iCountry) = yearOb(lastObForCountry(iCountry));
end
%{
now let's create some data that will be useful for getting later
summary statistics

The suffixes ROW and World refer to rest-of-world, everything other than
US and everything including US respectively.
%}
popLast = pop(lastObForCountry);
popROWLast = sum(popLast);
popUSALast = USApop(end);
popWorldLast = popROWLast + popUSALast;
popWeightUSALast = popUSALast/popWorldLast; % weighting US vs everyone else
popWeightROWLast = pop(lastObForCountry)/popROWLast;
popWeightWorldLast = pop(lastObForCountry)/popWorldLast;

yFirst = nan(nCountries,1);
yLast = nan(nCountries,1);

for iCountry = 1:nCountries
    yFirst(iCountry) = y(firstObForCountry(iCountry));
    yLast(iCountry) = y(lastObForCountry(iCountry));
end

yStarFirst = yStar(1);
yStarLast = yStar(end);
firstYear = USAyear(1);
lastYear = USAyear(end);

gdpLast = yLast.*popLast;


worldGDPLast = sum(gdpLast) + yStarLast*popUSALast;
% drop the bulky raw inputs and loop scratch variables, then write every
% remaining workspace variable to PWT90Data
clear numericData txt validObs iCountry;
save 'PWT90Data';
elapsedTime = toc;

disp([mfilename,' completed after ',num2str(elapsedTime),' seconds ',...
    datestr(now,'on mmmm-dd-yyyy at HH:MM:SS')]);

end

