# Remove every visible object from the current environment before the script runs, so that nothing left over from an
# earlier session is picked up by accident. Attached packages and options are not affected by this call.
# NOTE(review): if this script is source()d from another script, this also wipes the caller's objects — confirm that
# this is intended.
rm(list = ls())

library(zoo)
library(tidyverse)
library(openxlsx)
library(stargazer)

source('Code/Import_functions/import_country_facts.R')
source('Code/Import_functions/import_wb_cpi.R')

source('Code/Tidy_functions/change_nesting_order.R')

##################################### IMPORT DATA #################################################

# Country-level metadata, passed on to the CPI import below
country_facts <- import_country_facts(
  "Raw_data/Country_facts/country_facts.xlsx"
)

# Read the CPI series from the "hcpi_m" sheet of the World Bank inflation workbook; this is the latest vintage
cpi_all <- import_wb_cpi(
  path = "Raw_data/World_Bank_data/Inflation-data.xlsx",
  sheet = "hcpi_m",
  country_facts = country_facts
)

# Recode CPI levels that are exactly zero as missing.
#
# Levels that are tiny but strictly positive (e.g. the early months of countries that later experienced
# hyperinflation) are fine and are left untouched.
# Levels that are exactly zero arise when a source rounds the CPI to a few decimal places (e.g. Israel). They are a
# problem for two reasons: the inflation rate appears to be zero for as long as the level stays at exactly zero, and
# the log of the CPI level is undefined.
#
# cpi_for_country: the CPI levels for one country
# Returns the same object with every exact zero replaced by NA
set_zeroes_to_NA <- function(cpi_for_country) {
  is_exact_zero <- cpi_for_country == 0
  cpi_for_country[is_exact_zero] <- NA
  cpi_for_country
}
cpi_no_zeroes <- cpi_all %>%
  as.list() %>%
  map(set_zeroes_to_NA)

# Strip leading and trailing NAs from every series, then remove any countries that are left with no data
cpi_filtered <- cpi_no_zeroes %>%
  map(function(series) na.trim(series, sides = "both")) %>%
  Filter(f = function(series) length(series) > 0, x = .)

# Trim a series so that it contains no NAs at all.
#
# Trailing NAs are removed first. If any NA remains, everything up to and including the last remaining NA is
# discarded. This removes any sequence of alternating NA and non-NA values at the start of a series, like Iceland,
# but it also means that a single NA in the middle of a series drops all of the observations before it.
#
# z: a series whose index advances in steps of 1/12 (the restart point is computed by adding 1/12 to an index value)
# Returns the trimmed series
trim_zoo <- function(z) {
  without_trailing <- na.trim(z, sides = "right")
  is_missing <- is.na(without_trailing)

  # Nothing is missing once the tail has been trimmed, so there is nothing more to do
  if (!any(is_missing)) {
    return(without_trailing)
  }

  # Restart the series in the month after the last missing observation
  last_missing_yearmon <- last(index(without_trailing)[is_missing])
  window(without_trailing, start = (last_missing_yearmon + 1/12), end = NULL)
}
cpi <- cpi_filtered %>% map(trim_zoo)

################### TABULATE DESCRIPTIVE STATISTICS ON PUBLICATION LAG ########################

# Compute the latest outcome for each country: the index value of the final observation once leading and trailing
# NAs have been removed
compute_last_yearmon <- function(cpi_for_country) {
  cpi_for_country %>%
    na.trim(sides = "both") %>%
    index() %>%
    last()
}
last_yearmon <- do.call(what = c, args = map(cpi, compute_last_yearmon))

# Compute the publication lag for each country, in whole months
# If the publication lag is 5 months or more, I assume that the true publication lag was shorter, but the country
# stopped updating its CPI or the World Bank stopped updating it.
#
# cpi_for_country: the CPI series for one country
# compilation_yearmon: the month in which the dataset was compiled (defaults to January 2023)
# Returns the number of months between the country's last observation and compilation_yearmon
compute_raw_publication_lag <- function(cpi_for_country, compilation_yearmon = as.yearmon("Jan 2023")) {
  lag_in_years <- compilation_yearmon - compute_last_yearmon(cpi_for_country)

  # The difference between two yearmons is a floating-point number of years, so 12 times the difference is only
  # approximately a whole number (e.g. 0.99999999999909 rather than 1). Round it so that later equality and
  # threshold comparisons on the lag are exact.
  round(12 * lag_in_years)
}
raw_publication_lag <- do.call(c, map(cpi, compute_raw_publication_lag))

# Summarise the latest outcomes and publication lags, one row per country
# Note that data.frame() converts the back-quoted column names to Last.Observation and Publication.Lag
df <- data.frame(Country = names(cpi), `Last Observation` = last_yearmon, `Publication Lag` = raw_publication_lag)

# Number of countries whose publication lag equals x, counted from the Publication.Lag column of df itself so that
# the count cannot go out of step with the rows of df
count_countries <- function(x) {
  sum(df$Publication.Lag == x, na.rm = TRUE)
}

# One row per distinct publication lag, sorted from the shortest lag to the longest
distinct_lags <- unique(df$Publication.Lag)
count_df <- data.frame(`Publication Lag` = distinct_lags, `Number of Countries` = map_int(distinct_lags, count_countries))
summary_df <- count_df[order(count_df$Publication.Lag), ]

# The month of the latest observation implied by each lag, counting back from January 2023
summary_df$Last_yearmon <- as.yearmon("Jan 2023") - summary_df$Publication.Lag / 12

# Show the table as lag, latest observation, number of countries
summary_df[ , c(1, 3, 2)]

# Make a latex table for the paper
# Lags of 5 months or more are pooled into a single final row, labelled 5, which has no single latest observation (NA)

# The lags are derived from floating-point yearmon arithmetic, so round them to whole months before comparing them
# with a threshold, and split the rows with one mask and its complement so that no row can fall between "<= 4" and ">= 5"
lag_months <- round(summary_df$Publication.Lag)
is_long_lag <- lag_months >= 5
num_5_or_more <- sum(summary_df[is_long_lag, "Number.of.Countries"], na.rm = TRUE)

# Rows for the lags below 5 months, shown with the lag as a whole number of months
short_lag_rows <- summary_df[!is_long_lag, ]
short_lag_rows$Publication.Lag <- lag_months[!is_long_lag]

nice_df <- rbind(short_lag_rows, c(5, num_5_or_more, NA))
colnames(nice_df) <- c("Publication Lag", "Number of Countries", "Latest Observation")
stargazer(nice_df[ , c(3, 2, 1)], summary = FALSE, rownames = FALSE)

####################### CONSTRUCT VINTAGES OF MONTHLY CPI #################################

# The World Bank dataset was compiled in January 2023
# For countries whose publication lag is 1 to 4 months, we assume the WB obtained the most up-to-date data available at the time,
# so the latest vintage we can construct is January 2023
# For countries whose publication lag is 5 or more months, which is implausibly high, we assume the WB did not obtain the latest
# data available at the time or the country has stopped compiling a CPI.
# For these countries, we assume the publication lag is 4 months.
#
# cpi_for_country: the CPI series for one country
# max_lag: the largest publication lag, in months, that is taken at face value (defaults to 4)
# compilation_yearmon: the month in which the dataset was compiled (defaults to January 2023)
# Returns the publication lag in whole months, capped at max_lag
compute_publication_lag <- function(cpi_for_country, max_lag = 4, compilation_yearmon = as.yearmon("Jan 2023")) {
  # The difference between two yearmons is a floating-point number of years, so round 12 times the difference to get
  # an exact number of months (otherwise a lag of 4 months comes out as e.g. 3.99999999999909)
  raw_lag <- round(12 * (compilation_yearmon - compute_last_yearmon(cpi_for_country)))
  min(max_lag, raw_lag)
}

# Construct CPI vintages
#
# cpi_for_country: the full CPI series for one country
# Returns a list with one element per vintage month, named by vintage month. Each element is the full series cut off
# at the last observation that had been published by that vintage month.
construct_vintages_for_country <- function(cpi_for_country) {
  lag_in_years <- compute_publication_lag(cpi_for_country) / 12
  observed_yearmons <- index(cpi_for_country)

  # One observation is not enough for extrapolation, so the earliest vintage we construct is the first one that
  # would contain two monthly observations
  first_vintage <- first(observed_yearmons) + lag_in_years + 1/12

  # The latest vintage we construct is the one in which the final observation is published. This is usually
  # January 2023, the month the World Bank compiled its dataset. However, for countries whose latest observation is
  # quite far in the past, we assume the World Bank lacks the latest observation or the country stopped compiling
  # its CPI, so we cannot construct vintages all the way to January 2023
  final_vintage <- last(observed_yearmons) + lag_in_years
  vintage_yearmons <- seq(from = first_vintage, to = final_vintage, by = 1/12)

  # Construct each vintage by taking the subset of the full series that had been published by the vintage month
  vintages <- map(vintage_yearmons, function(vintage_yearmon) {
    window(cpi_for_country, start = NULL, end = (vintage_yearmon - lag_in_years))
  })
  names(vintages) <- vintage_yearmons
  vintages
}

# Build the vintages for every country, reshape them with convert_to_vintages_country() and save the result
cpi %>%
  map(construct_vintages_for_country) %>%
  convert_to_vintages_country() %>%
  saveRDS("Mid_data/FX/CPI/Monthly_CPI/cpi_m.rds")
