rm(list = ls())
library(tidyverse)
library(readxl)
library(here)
library(microdadosBrasil)
source(here::here("codes","essay1_paths.R"))

# Read POF to SNIPC ------

# Load the POF -> SNIPC correspondence table, skipping the first two
# spreadsheet rows. Columns 2 and 4 are renamed by position, NOME by name.
tab_pof_to_snipc <- read_excel(
    paste0(essay1_data_raw,"/ibge/Tabela_de_correspondencia_POF_SNIPC_2008_2009.xls"),
    skip = 2
)

tab_pof_to_snipc <- tab_pof_to_snipc %>%
    rename(descricao = 2, ITEM1 = 4, NOME1 = NOME) %>%
    mutate(GRUPO = as.numeric(GRUPO)) %>%
    # Join key used by the POF files: GRUPO in the thousands place plus ITEM1.
    mutate(cod_tab = GRUPO * 1000 + ITEM1) %>%
    # Drop rows 4226-4228 of the table.
    # NOTE(review): hard-coded positions; presumably non-data rows of this
    # specific spreadsheet -- confirm if the input file is ever replaced.
    filter(row_number() < 4226 | row_number() > 4228)


# Read one POF expenditure file and attach SNIPC codes to each record.
#
# Arguments:
#   file_type  POF record type, passed to read_POF() as `ft`. The value
#              "caderneta_despesa" takes its code prefix from
#              PROD_NUM_QUADRO_GRUPO_PRO; every other type uses NUM_QUADRO.
#   ano        Survey year, passed to read_POF() as `i` and used to build the
#              root path "<essay1_data_raw>/ibge/POF<ano>".
#   tab        Correspondence table to join on. Must contain the key column
#              `cod_tab` plus `subitem`, `descricao` and `NOME1`. Defaults to
#              the global `tab_pof_to_snipc`, which is what the function always
#              used before this argument was actually honoured.
#
# Returns a data frame with columns uf, uckey, estrato, fator, fator_set, cod,
# codi, obtencao, renda_mon_anual, subitem, vad, ipca_desc and pof_desc.
# Records whose `cod` has no match in `tab` are kept with NA in the joined
# columns (left join).
pof_to_snipc_expenditures <- function(file_type, ano, tab = tab_pof_to_snipc){
    x <- read_POF(ft = file_type, i = ano,
                  root_path = paste0(essay1_data_raw,"/ibge/POF",ano))

    # harmonize names with POF 2002
    x <- x %>%
        mutate(vad = as.numeric(VALOR_ANUAL_EXPANDIDO2),
               fator = as.numeric(FATOR_EXPANSAO2),
               renda = as.numeric(RENDA_TOTAL),
               item = as.numeric(COD_ITEM),
               quadro = as.numeric(NUM_QUADRO),
               fator_set = as.numeric(FATOR_EXPANSAO1),
               obtencao = as.numeric(COD_OBTENCAO),
               uf = as.numeric(COD_UF),
               seq = as.numeric(NUM_SEQ),
               dv = as.numeric(NUM_DV),
               domcl = as.numeric(COD_DOMC),
               uc = as.numeric(NUM_UC),
               estrato = as.numeric(NUM_EXT_RENDA),
               renda_mon_anual = renda*12,
               # Composite numeric key assembled from uf, seq, dv, domcl, uc
               # and estrato, each shifted into its own decimal positions.
               uckey = (uf*1000000000)+(seq*1000000)+(dv*100000)+(domcl*1000)+(uc*100)+estrato)

    # The code prefix is the only thing that differs between file types.
    if (file_type == "caderneta_despesa") {
        x <- x %>% mutate(cod_base = as.numeric(PROD_NUM_QUADRO_GRUPO_PRO))
    } else {
        x <- x %>% mutate(cod_base = quadro)
    }

    x %>%
        mutate(cod = cod_base*1000+floor(item/100),
               codi = cod_base*100000+item) %>%
        # Join against the table supplied by the caller rather than the global.
        left_join(tab, by = c("cod"="cod_tab")) %>%
        select(uf,uckey,estrato,fator,fator_set,cod,codi,
               obtencao,renda_mon_anual,subitem,vad,ipca_desc = descricao,
               pof_desc = NOME1)
}

# POF expenditure record types to process; each value is handed to
# pof_to_snipc_expenditures() as `file_type`.
file_types <- c(
    "despesa_90dias",
    "despesa_12meses",
    "outras_despesas",
    "servico_domestico",
    "aluguel_estimado",
    "despesa_veiculo",
    "despesa_individual",
    "caderneta_despesa"
)

# Name each element after itself so the resulting list is keyed by file type.
names(file_types) <- file_types

snipc_data_exp <- map(
    file_types,
    function(ft) {
        pof_to_snipc_expenditures(ft, ano = 2008, tab = tab_pof_to_snipc)
    }
)

# Parse INSS -----

# Survey year; also used by the earnings sections further down.
ano <- 2008

# Re-read the "servico_domestico" file, this time taking
# VALOR_INSS_ANUAL_EXPANDIDO2 as the value column, and store the coded result
# as an extra `inss` element of snipc_data_exp. Intermediates live inside
# local() so no additional globals are created.
snipc_data_exp$inss <- local({
    inss_raw <- read_POF(ft = "servico_domestico", i = ano,
                         root_path = paste0(essay1_data_raw,"/ibge/POF",ano))

    inss_coded <- inss_raw %>%
        mutate(
            val_def_anual_inss = as.numeric(VALOR_INSS_ANUAL_EXPANDIDO2),
            fator = as.numeric(FATOR_EXPANSAO2),
            renda = as.numeric(RENDA_TOTAL),
            item = as.numeric(COD_ITEM),
            quadro = as.numeric(NUM_QUADRO),
            fator_set = as.numeric(FATOR_EXPANSAO1),
            obtencao = as.numeric(COD_OBTENCAO),
            uf = as.numeric(COD_UF),
            seq = as.numeric(NUM_SEQ),
            dv = as.numeric(NUM_DV),
            domcl = as.numeric(COD_DOMC),
            uc = as.numeric(NUM_UC),
            estrato = as.numeric(NUM_EXT_RENDA),
            vad = val_def_anual_inss,
            renda_mon_anual = renda*12,
            # Same composite key layout as the expenditure files.
            uckey = (uf*1000000000)+(seq*1000000)+(dv*100000)+(domcl*1000)+(uc*100)+estrato,
            cod = quadro*1000+floor(item/100),
            codi = quadro*100000+item
        )

    inss_coded %>%
        left_join(tab_pof_to_snipc, by = c("cod"="cod_tab")) %>%
        select(uf,uckey,estrato,fator,fator_set,cod,codi,
               obtencao,renda_mon_anual,subitem,vad,ipca_desc = descricao,
               pof_desc = NOME1)
})

# Parse Earnings -----

# Read the "rendimentos" file and derive, in a single pass, the numeric
# columns, the annualised income, the composite key and a placeholder `codi`.
earnings <- read_POF(ft = "rendimentos", i = ano,
                     root_path = paste0(essay1_data_raw,"/ibge/POF",ano)) %>%
    mutate(
        fator = as.numeric(FATOR_EXPANSAO2),
        renda = as.numeric(RENDA_TOTAL),
        item = as.numeric(COD_ITEM),
        quadro = as.numeric(NUM_QUADRO),
        fator_set = as.numeric(FATOR_EXPANSAO1),
        uf = as.numeric(COD_UF),
        seq = as.numeric(NUM_SEQ),
        dv = as.numeric(NUM_DV),
        domcl = as.numeric(COD_DOMC),
        uc = as.numeric(NUM_UC),
        estrato = as.numeric(NUM_EXT_RENDA),
        ded_def_anual_IRPF = as.numeric(VAL_DEDUCAO_IR_CORRIGIDO),
        ded_def_anual_previ = as.numeric(VAL_DEDUCAO_PREV_CORRIGIDO),
        ded_def_anual_outras = as.numeric(VAL_DEDUCAO_OUTRA_CORRIGIDO),
        pos_ocup = as.numeric(COD_OCUP_FINAL),
        renda_mon_anual = renda*12,
        uckey = (uf*1000000000)+(seq*1000000)+(dv*100000)+(domcl*1000)+(uc*100)+estrato,
        codi = NA
    )

# One table per deduction column. The value is the deduction times the
# expansion factor; `cod` is quadro*1000 plus a fixed offset (600, 500, 700)
# plus pos_ocup when the value is positive, and 0 otherwise so the row can be
# filtered out later.
irpf <- mutate(
    earnings,
    vad = ded_def_anual_IRPF*fator,
    cod = ifelse(vad > 0, quadro*1000 + 600 + pos_ocup, 0)
)

previ <- mutate(
    earnings,
    vad = ded_def_anual_previ*fator,
    cod = ifelse(vad > 0, quadro*1000 + 500 + pos_ocup, 0)
)

outras <- mutate(
    earnings,
    vad = ded_def_anual_outras*fator,
    cod = ifelse(vad > 0, quadro*1000 + 700 + pos_ocup, 0)
)

# Parse other earnings -----
# Read the "outros_rendimentos" file. The value is VAL_DEDUCAO_CORRIGIDO times
# the expansion factor; rows with a non-positive value get cod = 0.
other_earnings <- read_POF(ft = "outros_rendimentos", i = ano,
                           root_path = paste0(essay1_data_raw,"/ibge/POF",ano)) %>%
    mutate(
        fator = as.numeric(FATOR_EXPANSAO2),
        renda = as.numeric(RENDA_TOTAL),
        item = as.numeric(COD_ITEM),
        quadro = as.numeric(NUM_QUADRO),
        fator_set = as.numeric(FATOR_EXPANSAO1),
        uf = as.numeric(COD_UF),
        seq = as.numeric(NUM_SEQ),
        dv = as.numeric(NUM_DV),
        domcl = as.numeric(COD_DOMC),
        uc = as.numeric(NUM_UC),
        estrato = as.numeric(NUM_EXT_RENDA),
        ded_def_anual = as.numeric(VAL_DEDUCAO_CORRIGIDO),
        renda_mon_anual = renda*12,
        uckey = (uf*1000000000)+(seq*1000000)+(dv*100000)+(domcl*1000)+(uc*100)+estrato,
        codi = NA,
        vad = ded_def_anual*fator,
        cod = ifelse(vad > 0, quadro*1000 + floor(item/100), 0)
    )

# Stack the deduction tables, attach SNIPC codes, keep the common columns and
# discard rows flagged with cod = 0 (rows with NA cod are dropped as well,
# since filter() only keeps rows where the condition is TRUE).
snipc_data_earn <- list(irpf, previ, outras, other_earnings) %>%
    bind_rows() %>%
    left_join(tab_pof_to_snipc, by = c("cod"="cod_tab")) %>%
    select(uf,uckey,estrato,fator,fator_set,cod,codi,
           renda_mon_anual,subitem,vad,ipca_desc = descricao,
           pof_desc = NOME1) %>%
    filter(cod > 0)

# Combine expenditure and earnings records, drop unmatched and aggregate
# rows, then derive the IPCA code hierarchy.
pof_ipca <- snipc_data_exp %>%
    bind_rows() %>%
    bind_rows(snipc_data_earn) %>%
    # Rows described exactly as "AGREGADO" get subitem 0.
    mutate(subitem = ifelse(pof_desc == "AGREGADO", 0, subitem)) %>%
    # Keep rows that have a subitem and whose description does not mention
    # "AGREGADO" (exclude Agregados). Rows with a missing pof_desc are
    # dropped too, because filter() discards NA conditions.
    filter(!is.na(subitem),
           !str_detect(pof_desc, "AGREGADO")) %>%
    # Create ipca categories by truncating the numeric code.
    mutate(
        ipca = as.numeric(subitem),
        ipca_group = floor(ipca / 1e6),
        ipca_subgroup = floor(ipca / 1e5),
        ipca_item = floor(ipca / 1e3),
        renda_mensal_uc = renda_mon_anual / 12
    ) %>%
    # NOTE(review): the previous form, select(-subitem, renda_mon_anual),
    # only removed `subitem`; the positive term kept renda_mon_anual in place.
    # That behaviour is preserved here. If the intent was to drop
    # renda_mon_anual as well, write -renda_mon_anual -- confirm against the
    # consumers of the saved file first.
    select(-subitem)

# Keep only the IPCA codes that appear at level 7 of the processed CPI table.
cpi <- readRDS(paste0(essay1_data_processed,"/ibge/cpi_br.rds"))

cpi_level7 <- filter(cpi, level == 7)
cpi_codes <- cpi_level7 %>%
    distinct(ipca) %>%
    mutate(ipca = as.numeric(ipca))
rm(cpi_level7)

pof_ipca <- semi_join(pof_ipca, cpi_codes, by = "ipca")

# Save
saveRDS(pof_ipca,
        file = paste0(essay1_data_processed,"/ibge/pof2008_with_ipca_codes.rds"))
