rm(list = ls())
library(tidyverse)
library(here)
library(readxl)
library(lubridate)
library(magrittr)
source(here::here("codes","essay1_paths.R"))

# Read Correspondence Table (obtained at meeting with Marcelo) ------

# Read one sheet of the brand correspondence workbook (cadastro_marcas.xlsx).
#
# s: sheet identifier. It is converted to character before being passed to
#    read_excel(), so the sheet is looked up by name (e.g. "2001").
# Returns the result of read_excel() for that sheet.
# NOTE(review): essay1_data_raw is not defined in this file; presumably it is
# set by the sourced essay1_paths.R - confirm.
read_correspondence_table <- function(s) {
  workbook_path <- paste0(essay1_data_raw, "/fipe/cadastro_marcas.xlsx")
  sheet_name <- paste0(s)
  read_excel(workbook_path, sheet = sheet_name)
}

# Stack the sheets named 2001 to 2004 and keep one row per distinct
# combination of year, codigo, VFMARCA, VFNOME and active_marca.
varieties_to_ipc_correspondence <- 2001:2004 %>%
  map(read_correspondence_table) %>%
  bind_rows() %>%
  select(year = vfano,
         codigo = vfprd,
         VFMARCA = vfmarca,
         VFNOME,
         vfatv) %>%
  # active_marca is 1 when vfatv equals 1 and 0 otherwise (NA stays NA)
  mutate(active_marca = ifelse(vfatv == 1, 1, 0)) %>%
  # the raw flag is only needed to derive active_marca
  select(-vfatv) %>%
  distinct()

# Create continuing variety indicator:
# cont_marca is 1 when a (codigo, VFMARCA, year) combination appears in exactly
# one row of the correspondence table and that row has active_marca == 1;
# otherwise it is 0.
varieties_to_ipc_correspondence <- varieties_to_ipc_correspondence %>%
  group_by(codigo, VFMARCA, year) %>%
  mutate(cont_marca = ifelse(n() == 1 & active_marca == 1, 1, 0)) %>%
  ungroup()


# Read varieties dataset -----
# Read the average-price file, keep the identifier columns plus columns 47 to 82
# (36 columns, selected by position), and reshape them to long format: one row
# per identifier combination and original price column.
# NOTE(review): guess_max = 1 makes readr guess column types from the first row
# only - confirm this is intended.
# NOTE(review): presumably columns 47:82 are the monthly prices for the sample
# period - verify against the file layout.
ipcfipe_varieties <- read_csv2(paste0(essay1_data_raw,"/fipe/PrecoMedio1998-2007.csv"), guess_max = 1) %>%
  select(VFMARCA,VFESTAB,VFLOCAL,NOMEMARCA,NOMELOCAL,47:82) %>%
  gather(key = date_original,
         value = avg_price,
         -c(VFLOCAL,VFMARCA,VFESTAB,NOMEMARCA,NOMELOCAL)) %>%
  # The column name ends in the year (last 4 characters), preceded by one
  # separator character and the 2-character month.
  mutate(year = str_sub(date_original, -4, -1),
         month = str_sub(date_original, -7, -6),
         # Build a clean "YYYY-MM-01" string. paste() with its default
         # sep = " " would produce "YYYY - MM -01" and rely on lenient parsing.
         date = ymd(paste(year, month, "01", sep = "-"))) %>%
  mutate_at(vars(year,month),~as.numeric(.)) %>%
  select(-date_original) %>%
  # A price of exactly 0 is treated as missing
  mutate(avg_price = ifelse(avg_price == 0,NA,avg_price))

# Clean up cases with two quotes for the same month (usually at least one is NA)
# For every VFMARCA/VFESTAB/VFLOCAL/date cell, record how many rows it has (n)
# and how many of those rows have a missing price (count_na).
ipcfipe_varieties <- ipcfipe_varieties %>%
  group_by(VFMARCA, VFESTAB, VFLOCAL, date) %>%
  mutate(count_na = sum(is.na(avg_price)),
         n = n()) %>%
  ungroup()

# Diagnostic table: duplicated cells that contain at least one missing price
check_duplicates_na <- ipcfipe_varieties %>%
  filter(n != 1 & count_na >= 1)

# Keep every unduplicated row, plus the non-missing rows of duplicated cells
# that have exactly one missing price. Duplicated cells with any other number
# of missing prices are dropped entirely.
ipcfipe_varieties <- ipcfipe_varieties %>%
  filter(n == 1 | (!is.na(avg_price) & count_na == 1))

# None with more than 36 observations
# (sanity check: lists any VFMARCA/VFESTAB/VFLOCAL combination with over 36 rows)
ipcfipe_varieties %>%
  group_by(VFMARCA, VFESTAB, VFLOCAL) %>%
  summarise(n = n()) %>%
  filter(n > 36)


# Impute price - assume prices did not change if we have only one missing between two observations
# Within each VFMARCA/VFESTAB/VFLOCAL series (ordered by date), a missing price
# whose previous and next prices are both observed takes the previous price.
# Every other row keeps its original avg_price (possibly NA).
ipcfipe_varieties <- ipcfipe_varieties %>%
  group_by(VFMARCA, VFESTAB, VFLOCAL) %>%
  mutate(price_before = lag(avg_price, order_by = date),
         price_after = lead(avg_price, order_by = date),
         avg_price_inputed = ifelse(
           is.na(avg_price) & !is.na(price_after) & !is.na(price_before),
           price_before,
           avg_price
         )) %>%
  ungroup() %>%
  # helper columns are only needed for the imputation rule
  select(-price_before, -price_after)


# Keep only cases that are observed through the whole period:
# after dropping rows whose imputed price is still missing, a
# VFMARCA/VFESTAB/VFLOCAL series is kept only if it has exactly 36 rows.
ipcfipe_varieties <- ipcfipe_varieties %>%
  filter(!is.na(avg_price_inputed)) %>%
  add_count(VFMARCA, VFESTAB, VFLOCAL, name = "nobs_variety_inputed") %>%
  filter(nobs_variety_inputed == 36)

# Other cleaning up

# Drop the following brand codes (VFMARCA):
#    346004 - Imposto Predial - Isentos
#    366022 - Aluguel - Nec
#    366023 - Aluguel - Vago
#    366024 - Aluguel - Recusa
#    366025 - Aluguel - Outros
#   1092000 - Lotacao
ipcfipe_varieties <- ipcfipe_varieties %>%
  filter(!VFMARCA %in% c(346004, 366022, 366023, 366024, 366025, 1092000))



# Join IPC codes to varieties
# Keep only price rows whose (year, VFMARCA) pair appears in the correspondence
# table and attach the correspondence columns to them. Then sort, keep the
# identifier/date columns with the imputed price (renamed to avg_price), and
# drop duplicate rows.
varieties <- ipcfipe_varieties %>%
  inner_join(varieties_to_ipc_correspondence, by = c("year", "VFMARCA")) %>%
  arrange(codigo, VFMARCA, VFESTAB, VFLOCAL, date) %>%
  select(codigo, VFMARCA, VFESTAB, VFLOCAL,
         year, month, date,
         avg_price = avg_price_inputed) %>%
  distinct()

# Create base prices

# Month whose price is used as the base price of each variety
base_date <- as_date("2002-04-01")

# One row per variety observed at base_date, with its price stored as
# base_price. The date columns and the current price are dropped.
base_prices <- varieties %>%
  ungroup() %>%
  filter(date == base_date) %>%
  mutate(base_price = avg_price) %>%
  select(-date, -year, -month, -avg_price)

# Create number of varieties: number of base-price rows sharing the same codigo.
# ungroup() afterwards so base_prices does not stay grouped by codigo and
# silently change the behaviour of later grouped-aware verbs.
base_prices <- base_prices %>%
  group_by(codigo) %>%
  mutate(number_variety = n()) %>%
  ungroup()

# Join base prices and compute relative prices (avg_price / base_price), then
# exclude cases in which I have only one variety.
# Rows without a matching base price get NA in number_variety after the left
# join; filter() drops those rows as well, since the comparison evaluates to NA.
varieties <- varieties %>%
  left_join(base_prices, by = c("codigo", "VFMARCA", "VFESTAB", "VFLOCAL")) %>%
  mutate(rel_price = avg_price / base_price) %>%
  filter(number_variety != 1)

# Save
saveRDS(object = varieties,
        file = paste0(essay1_data_processed,"/fipe/varieties_prices_complete.rds"))
