# Package setup ----
# Install only the packages that are not already available, so that re-running
# the script does not download and reinstall everything each time.
# readxl, lubridate and sf are listed explicitly because the script calls
# read_excel(), year() and sf:: functions directly.
required_packages <- c(
  "tidyverse", "readxl", "lubridate", "ivreg", "sandwich", "clubSandwich",
  "svglite", "mapchina", "ggthemes", "sf"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
library(readxl)
library(lubridate) # year(); tidyverse only attaches lubridate from version 2.0.0
library(ivreg)
library(sandwich)
library(clubSandwich)
library(ggplot2)
library(svglite)
library(mapchina)
library(ggthemes)

# Directory prefixes ----
# Each prefix ends in "/" because full paths are built with paste0().
input_folder <- "data/"
graph_output <- "graphs/"
file_output <- "output/"

# Province identifiers ----
# The 31 province-level units, spelled as in the column names of the input
# workbooks. (`provinces` is reassigned to a lookup data frame further down,
# in the mapping section.)
provinces <- c(
  "beijing", "tianjin", "hebei", "shanxi", "innermongolia",
  "liaoning", "jilin", "heilongjiang",
  "shanghai", "jiangsu", "zhejiang", "anhui", "fujian", "jiangxi", "shandong",
  "henan", "hubei", "hunan", "guangdong", "guangxi", "hainan",
  "chongqing", "sichuan", "guizhou", "yunnan", "tibet",
  "shaanxi", "gansu", "qinghai", "ningxia", "xinjiang"
)

# Regional groupings; together the six regions cover every entry of `provinces`.
north <- c("beijing", "tianjin", "hebei", "shanxi", "innermongolia")
northeast <- c("liaoning", "jilin", "heilongjiang")
shangzhejiang <- c(
  "shanghai", "jiangsu", "zhejiang", "anhui", "fujian", "jiangxi", "shandong"
)
central <- c("henan", "hubei", "hunan", "guangdong", "guangxi", "hainan")
southwest <- c("chongqing", "sichuan", "guizhou", "yunnan", "tibet")
northwest <- c("shaanxi", "gansu", "qinghai", "ningxia", "xinjiang")

# Provinces flagged by the `coastal` dummy added to df_main.
# NOTE(review): "henan" is in this list although Henan is landlocked -- confirm
# whether that is intended before relying on the dummy.
coastal <- c(
  "tianjin", "hebei", "liaoning", "shanghai", "jiangsu", "zhejiang",
  "fujian", "shandong", "henan", "guangdong", "guangxi", "hainan"
)

# Load census workbooks ----
# Reads one sheet (default "processed") of a workbook stored in input_folder.
read_census_sheet <- function(file_name, sheet_name = "processed") {
  read_excel(paste0(input_folder, file_name), sheet = sheet_name)
}

# National population by single year of age (columns Age and Population).
df_1982_census_age_national <- read_census_sheet("1982_census_age_national.xls")
df_1990_census_age_national <- read_census_sheet("1990_census_age_national.xls")
# From 2000 onwards Population is multiplied by 10
# (presumably the sheets hold a 10% sample -- TODO confirm).
df_2000_census_age_national <- read_census_sheet("2000_census_age_national.xls") %>%
  mutate(Population = Population * 10)
# In the 2010 and 2020 sheets both columns are coerced to numeric first.
df_2010_census_age_national <- read_census_sheet("2010_census_age_national.xls") %>%
  mutate(Age = as.numeric(Age), Population = as.numeric(Population) * 10)
df_2020_census_age_national <- read_census_sheet("2020_census_age_national.xls") %>%
  mutate(Age = as.numeric(Age), Population = as.numeric(Population) * 10)

# Population by age with one column per province (plus an Age column).
df_1982_census_age_by_province <- read_census_sheet("1982_census_age_by_province.xls")
df_1990_census_age_by_province <- read_census_sheet("1990_census_age_by_province.xls")
df_2000_census_age_by_province <- read_census_sheet("2000_census_age_by_province.xls")
df_2010_census_age_by_province <- read_census_sheet("2010_census_age_by_province.xls")
df_2020_census_age_by_province <- read_census_sheet("2020_census_age_by_province.xls")

# 1982 provincial population totals; these live on the "by_province" sheet of
# the national 1982 workbook.
df_1982_census_pop_by_province <- read_census_sheet(
  "1982_census_age_national.xls",
  sheet_name = "by_province"
)

# Total population per province for the 1990, 2000 and 2010 censuses ----
# Sums a province-by-age table over all age rows and tags the result with the
# census year.
total_pop_by_province <- function(census_by_age, census_year) {
  census_by_age %>%
    pivot_longer(!Age, names_to = "province", values_to = "pop") %>%
    select(-Age) %>%
    group_by(province) %>%
    summarise(pop = sum(pop)) %>%
    mutate(Year = census_year)
}

# One row per province and census year; joined onto df_main by (province, Year).
df_pop_py_province <- rbind(
  total_pop_by_province(df_1990_census_age_by_province, 1990),
  total_pop_by_province(df_2000_census_age_by_province, 2000),
  total_pop_by_province(df_2010_census_age_by_province, 2010)
)

# GDP per capita and GDP by sector; one row per Year, one column per series.
df_gdp_per_cap_industry <- read_excel(
  paste0(input_folder, "gdp_per_cap_and_industry.xlsx")
)

# Reads the "processed" sheet of a census industry workbook (one row per
# industry, one column per province) and transposes it to one row per province
# with one column per industry, tagged with the census year.
load_census_industry <- function(file_name, census_year) {
  read_excel(paste0(input_folder, file_name), sheet = "processed") %>%
    pivot_longer(!industry, names_to = "province", values_to = "proportion") %>%
    mutate(Year = census_year) %>%
    pivot_wider(names_from = "industry", values_from = "proportion")
}

df_1990_census_industry <- load_census_industry("1990_census_industries.xlsx", 1990)
df_2000_census_industry <- load_census_industry("2000_census_industries.xlsx", 2000)
df_2010_census_industry <- load_census_industry("2010_census_industries.xlsx", 2010)

df_national_gdp_pop <- read_excel(paste0(input_folder, "national_gdp_pop.xlsx"))

# Decadal log change in provincial GDP per capita ----
# Result: one row per (province, Year) where Year is the START of the decade
# (1990, 2000 or 2010) and delta_gdp_pc is log(end) - log(start).
df_gdp_pc <- df_gdp_per_cap_industry %>%
  select(Year, ends_with("gdp_pc")) %>%
  mutate(Year = year(Year)) %>%
  filter(Year %in% c(1982, 1990, 2000, 2010, 2020)) %>%
  pivot_longer(!Year, names_to = "province", values_to = "gdp_pc") %>%
  # keep the part of the column name before the first underscore
  mutate(province = str_extract(province, "[^_]+")) %>%
  filter(province != "national") %>%
  pivot_wider(names_from = "Year", values_from = "gdp_pc") %>%
  # temporary names keep the year columns intact while all three differences
  # are computed
  mutate(
    temp_2010 = log(`2020`) - log(`2010`),
    temp_2000 = log(`2010`) - log(`2000`),
    temp_1990 = log(`2000`) - log(`1990`)
  ) %>%
  select(province, temp_2010, temp_2000, temp_1990) %>%
  rename("2010" = "temp_2010", "2000" = "temp_2000", "1990" = "temp_1990") %>%
  pivot_longer(!province, names_to = "Year", values_to = "delta_gdp_pc") %>%
  # pivot_longer() returns the year labels as character
  mutate(Year = as.numeric(Year))

# Economic structure: primary and secondary sector shares of provincial GDP ----
df_gdp_comp <- df_gdp_per_cap_industry %>%
  select(Year, ends_with(c("_gdp", "_gdp_primary", "_gdp_secondary"))) %>%
  mutate(Year = year(Year)) %>%
  filter(Year %in% c(1990, 2000, 2010)) %>%
  pivot_longer(!Year, names_to = "variable") %>%
  # "<province>_gdp_primary" -> province = "<province>", variable = "gdp_primary"
  separate(
    variable,
    into = c("province", "variable"),
    sep = "_",
    remove = TRUE,
    extra = "merge"
  ) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  mutate(
    prop_primary = gdp_primary / gdp,
    prop_secondary = gdp_secondary / gdp
  ) %>%
  select(-c(gdp, gdp_primary, gdp_secondary))

# Census industry shares for 1990, 2000 and 2010 stacked into one table.
df_census_industries <- rbind(
  df_1990_census_industry,
  df_2000_census_industry,
  df_2010_census_industry
)

# Old-age ratio by province and census ----
# prop_60_or_older = population aged 60+ divided by population aged 20+.
# Age labels are reduced to their first run of digits (e.g. "60-64" -> 60).
# When `na_as_zero` is TRUE, missing counts are treated as 0 before summing.
share_aged_60plus <- function(census_by_age, census_year, na_as_zero = FALSE) {
  long_counts <- census_by_age %>%
    pivot_longer(cols = -Age, names_to = "province", values_to = "count") %>%
    mutate(Age = as.numeric(str_extract(Age, "\\d+")))
  if (na_as_zero) {
    long_counts <- long_counts %>%
      mutate(count = ifelse(is.na(count), 0, count))
  }
  long_counts %>%
    group_by(province) %>%
    summarize(
      prop_60_or_older =
        sum(ifelse(Age >= 60, count, 0)) / sum(ifelse(Age >= 20, count, 0))
    ) %>%
    mutate(Year = census_year)
}

# Only the 1982 table has its missing cells zero-filled.
df_1982_60plus <- share_aged_60plus(
  df_1982_census_age_by_province, 1982,
  na_as_zero = TRUE
)
df_1990_60plus <- share_aged_60plus(df_1990_census_age_by_province, 1990)
df_2000_60plus <- share_aged_60plus(df_2000_census_age_by_province, 2000)
df_2010_60plus <- share_aged_60plus(df_2010_census_age_by_province, 2010)
df_2020_60plus <- share_aged_60plus(df_2020_census_age_by_province, 2020)

# Decadal change in the old-age ratio ----
# Stacks the given share tables, spreads them to one column per census year and
# returns, per province, the log change and the level change between
# `start_year` and `end_year`. The result is tagged with Year = start_year.
change_in_60plus <- function(share_tables, start_year, end_year) {
  start_col <- as.character(start_year)
  end_col <- as.character(end_year)
  do.call(rbind, share_tables) %>%
    pivot_wider(names_from = "Year", values_from = "prop_60_or_older") %>%
    mutate(
      delta_60plus = log(.data[[end_col]]) - log(.data[[start_col]]),
      delta_60plus_lvls = .data[[end_col]] - .data[[start_col]]
    ) %>%
    select(province, delta_60plus, delta_60plus_lvls) %>%
    mutate(Year = start_year)
}

# Each call also stacks a third census that does not enter the difference; it
# only affects which provinces get a row.
df_over60_change_2010 <- change_in_60plus(
  list(df_2000_60plus, df_2010_60plus, df_2020_60plus),
  start_year = 2010, end_year = 2020
)
df_over60_change_2000 <- change_in_60plus(
  list(df_2000_60plus, df_2010_60plus, df_1990_60plus),
  start_year = 2000, end_year = 2010
)
df_over60_change_1990 <- change_in_60plus(
  list(df_1990_60plus, df_2000_60plus, df_1982_60plus),
  start_year = 1990, end_year = 2000
)
df_over60_change <- rbind(
  df_over60_change_2010,
  df_over60_change_2000,
  df_over60_change_1990
)

# generate predicted proportion over 60 from 2010 to 2020
# Approach: take each province's age structure from the 2000 census and age it
# forward with NATIONAL cohort survival ratios, giving a predicted old-age
# ratio (60+ over 20+) for 2010 and for 2020.

# 10-year survival ratios. The 2010 national ages are shifted back 10 years so
# each row lines up with the same cohort in 2000; survival_ratio is the 2010
# population over the 2000 population. Ratios are averaged within 5-year age
# buckets, weighted by the 2010 population (Population.x).
# ifelse() turns the cut() factor into its integer codes and gives age 0 its
# own bucket (code 0), so bucket 1 holds ages 1-4.
survival_odds_2000_2010 <- merge(df_2010_census_age_national %>% mutate(Year = 2010, actual_age = Age, Age = Age - 10), df_2000_census_age_national %>% mutate(Year = 2000), by="Age") %>%
  mutate(survival_ratio = Population.x/Population.y, age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = F)) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename('Age' = 'age_bucket')
# Bucket codes are replaced BY POSITION with the age labels of the provincial
# 2000 table (all rows except 21 and 22).
# NOTE(review): this relies on the buckets and the provincial rows being in the
# same order and of equal number -- verify against the workbook.
survival_odds_2000_2010$Age <- df_2000_census_age_by_province$Age[-c(21,22)]
  
# 20-year survival ratios (2020 ages shifted back 20 years), built the same way;
# here rows 19-22 of the provincial labels are dropped.
survival_odds_2000_2020 <- merge(df_2020_census_age_national %>% mutate(Year = 2020, actual_age = Age, Age = Age - 20), df_2000_census_age_national %>% mutate(Year = 2000), by="Age") %>%
  mutate(survival_ratio = Population.x/Population.y, age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = F)) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename('Age' = 'age_bucket')
survival_odds_2000_2020$Age <- df_2000_census_age_by_province$Age[-c(19, 20, 21, 22)]

# Provincial 2000 age structure in long form: Age label, province, count.
provincial_2000 <- df_2000_census_age_by_province %>%
  pivot_longer(cols = -Age, names_to = "province", values_to = "count")
# Predicted 2010 ratio: people aged 50+ in 2000 are 60+ in 2010, and those aged
# 10+ are 20+. Age groups without a survival ratio are dropped.
predicted_provincial_2010 <- left_join(provincial_2000, survival_odds_2000_2010, by="Age") %>%
  mutate(predicted_2010 = weighted_survival_rate*count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2010)) %>%
  summarize(prop_60_or_older_2010 = sum(ifelse(Age >= 50, predicted_2010, 0))/sum(ifelse(Age >= 10, predicted_2010, 0)))

# Predicted 2020 ratio: people aged 40+ in 2000 are 60+ in 2020; everyone alive
# in 2000 is 20+ by 2020, so the denominator is all ages.
predicted_provincial_2020 <- left_join(provincial_2000, survival_odds_2000_2020, by="Age") %>%
  mutate(predicted_2020 = weighted_survival_rate*count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2020)) %>%
  summarize(prop_60_or_older_2020 = sum(ifelse(Age >= 40, predicted_2020, 0))/sum(ifelse(Age >= 0, predicted_2020, 0)))

# Predicted 2010 -> 2020 change, in logs and in levels, tagged Year = 2010.
pred_over60_change_2010 <- merge(predicted_provincial_2010, predicted_provincial_2020, by="province") %>%
  mutate(delta_pred_over60 = log(prop_60_or_older_2020)-log(prop_60_or_older_2010),
         delta_pred_over60_lvls = prop_60_or_older_2020-prop_60_or_older_2010) %>%
  select(province, delta_pred_over60, delta_pred_over60_lvls) %>%
  mutate(Year = 2010)

# generate predicted proportion over 60 from 2000 to 2010
# Approach: take each province's age structure from the 1990 census and age it
# forward with NATIONAL cohort survival ratios, giving a predicted old-age
# ratio (60+ over 20+) for 2000 and for 2010.

# 10-year survival ratios: 2000 national ages are shifted back 10 years so each
# row lines up with the same cohort in 1990. Ratios are averaged within 5-year
# age buckets, weighted by the later census population (Population.x).
# ifelse() turns the cut() factor into its integer codes and gives age 0 its
# own bucket (code 0).
survival_odds_1990_2000 <- merge(
  df_2000_census_age_national %>%
    mutate(Year = 2000, actual_age = Age, Age = Age - 10),
  df_1990_census_age_national %>% mutate(Year = 1990),
  by = "Age"
) %>%
  mutate(
    survival_ratio = Population.x / Population.y,
    age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = FALSE)
  ) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename("Age" = "age_bucket")
# Bucket codes are replaced BY POSITION with the provincial 1990 age labels.
# NOTE(review): relies on the buckets and the provincial rows being in the same
# order and of equal number -- verify against the workbook.
survival_odds_1990_2000$Age <- df_1990_census_age_by_province$Age[-c(21, 22)]

# 20-year survival ratios (2010 ages shifted back 20 years). The base census is
# the 1990 one, so it is tagged Year = 1990 (it was previously mislabelled 2000;
# the Year column is dropped by summarize(), so results are unaffected).
survival_odds_1990_2010 <- merge(
  df_2010_census_age_national %>%
    mutate(Year = 2010, actual_age = Age, Age = Age - 20),
  df_1990_census_age_national %>% mutate(Year = 1990),
  by = "Age"
) %>%
  mutate(
    survival_ratio = Population.x / Population.y,
    age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = FALSE)
  ) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename("Age" = "age_bucket")
survival_odds_1990_2010$Age <- df_1990_census_age_by_province$Age[-c(19, 20, 21, 22)]

# Provincial 1990 age structure in long form: Age label, province, count.
provincial_1990 <- df_1990_census_age_by_province %>%
  pivot_longer(cols = -Age, names_to = "province", values_to = "count")

# Predicted 2000 ratio: aged 50+ in 1990 -> 60+ in 2000; aged 10+ -> 20+.
# NOTE: this reuses (overwrites) the name predicted_provincial_2000.
predicted_provincial_2000 <- left_join(provincial_1990, survival_odds_1990_2000, by = "Age") %>%
  mutate(predicted_2000 = weighted_survival_rate * count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2000)) %>%
  summarize(
    prop_60_or_older_2000 =
      sum(ifelse(Age >= 50, predicted_2000, 0)) / sum(ifelse(Age >= 10, predicted_2000, 0))
  )

# Predicted 2010 ratio: aged 40+ in 1990 -> 60+ in 2010; denominator all ages.
# NOTE: this overwrites the predicted_provincial_2010 built from the 2000 census.
predicted_provincial_2010 <- left_join(provincial_1990, survival_odds_1990_2010, by = "Age") %>%
  mutate(predicted_2010 = weighted_survival_rate * count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2010)) %>%
  summarize(
    prop_60_or_older_2010 =
      sum(ifelse(Age >= 40, predicted_2010, 0)) / sum(ifelse(Age >= 0, predicted_2010, 0))
  )

# Predicted 2000 -> 2010 change, in logs and in levels, tagged Year = 2000.
pred_over60_change_2000 <- merge(predicted_provincial_2000, predicted_provincial_2010, by = "province") %>%
  mutate(
    delta_pred_over60 = log(prop_60_or_older_2010) - log(prop_60_or_older_2000),
    delta_pred_over60_lvls = prop_60_or_older_2010 - prop_60_or_older_2000
  ) %>%
  select(province, delta_pred_over60, delta_pred_over60_lvls) %>%
  mutate(Year = 2000)

# generate predicted proportion over 60 from 1990 to 2000, first generating an
# implied national age structure for 1982

# Implied 1982 population by age and province: each province's age distribution
# (counts with NAs set to 0, rescaled to shares that sum to 1 within the
# province) multiplied by the province's `population` from the by_province sheet.
df_1982_implied_census_age_by_province <- df_1982_census_age_by_province %>%
  pivot_longer(!Age, names_to = "province", values_to = "sample_pop") %>%
  group_by(province) %>%
  mutate(
    sample_pop = ifelse(is.na(sample_pop), 0, sample_pop),
    sample_pop = sample_pop / sum(sample_pop)
  ) %>%
  left_join(df_1982_census_pop_by_province, by = "province") %>%
  mutate(population = population * sample_pop) %>%
  select(-sample_pop)

# Implied national 1982 age structure: provincial figures summed by age.
df_1982_implied_census_age_national <- df_1982_implied_census_age_by_province %>%
  group_by(Age) %>%
  summarise(Population = sum(population))

# National survival ratio by single year of age between 1982 and a later
# census: the later ages are shifted back by `years_elapsed` to line up with
# the 1982 cohort, and the ratio is later population over 1982 population.
cohort_survival_from_1982 <- function(later_census, later_year, years_elapsed) {
  merge(
    later_census %>%
      mutate(Year = later_year, actual_age = Age, Age = Age - years_elapsed),
    df_1982_implied_census_age_national %>% mutate(Year = 1982),
    by = "Age"
  ) %>%
    mutate(survival_ratio = Population.x / Population.y) %>%
    group_by(Age) %>%
    summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x))
}

survival_odds_1982_1990 <- cohort_survival_from_1982(df_1990_census_age_national, 1990, 8)
survival_odds_1982_2000 <- cohort_survival_from_1982(df_2000_census_age_national, 2000, 18)

provincial_1982 <- df_1982_implied_census_age_by_province %>%
  rename("count" = "population")

# Predicted 1990 ratio: aged 52+ in 1982 -> 60+ in 1990; aged 12+ -> 20+.
predicted_provincial_1990 <- provincial_1982 %>%
  left_join(survival_odds_1982_1990, by = "Age") %>%
  mutate(predicted_1990 = weighted_survival_rate * count) %>%
  group_by(province) %>%
  filter(!is.na(predicted_1990)) %>%
  summarize(
    prop_60_or_older_1990 =
      sum(ifelse(Age >= 52, predicted_1990, 0)) / sum(ifelse(Age >= 12, predicted_1990, 0))
  )

# Predicted 2000 ratio: aged 42+ in 1982 -> 60+ in 2000; aged 2+ -> 20+.
predicted_provincial_2000 <- provincial_1982 %>%
  left_join(survival_odds_1982_2000, by = "Age") %>%
  mutate(predicted_2000 = weighted_survival_rate * count) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2000)) %>%
  summarize(
    prop_60_or_older_2000 =
      sum(ifelse(Age >= 42, predicted_2000, 0)) / sum(ifelse(Age >= 2, predicted_2000, 0))
  )

# Predicted 1990 -> 2000 change, in logs and in levels, tagged Year = 1990.
pred_over60_change_1990 <- merge(predicted_provincial_1990, predicted_provincial_2000, by = "province") %>%
  mutate(
    delta_pred_over60 = log(prop_60_or_older_2000) - log(prop_60_or_older_1990),
    delta_pred_over60_lvls = prop_60_or_older_2000 - prop_60_or_older_1990
  ) %>%
  select(province, delta_pred_over60, delta_pred_over60_lvls) %>%
  mutate(Year = 1990)

# combine variables
# Everything is left-joined onto df_gdp_pc by (province, Year). The three
# predicted-change tables share column names, so the successive joins leave
# ".x" (2010 table), ".y" (2000 table) and unsuffixed (1990 table) versions;
# each covers a different Year, and they are collapsed back into one column.
df_list <- list(
  df_gdp_pc,
  df_census_industries,
  df_over60_change,
  pred_over60_change_2010,
  pred_over60_change_2000,
  pred_over60_change_1990
)
df_main <- df_list %>%
  reduce(left_join, by = c("province", "Year")) %>%
  mutate(
    delta_pred_over60 = coalesce(
      delta_pred_over60.x, delta_pred_over60.y, delta_pred_over60
    ),
    delta_pred_over60_lvls = coalesce(
      delta_pred_over60_lvls.x, delta_pred_over60_lvls.y, delta_pred_over60_lvls
    )
  ) %>%
  select(-c(
    delta_pred_over60.x, delta_pred_over60.y,
    delta_pred_over60_lvls.x, delta_pred_over60_lvls.y
  )) %>%
  # drop chongqing/sichuan for the 2000 rows and chongqing/sichuan/hainan for
  # the 1990 rows
  filter(
    !(province %in% c("chongqing", "sichuan") & Year == 2000),
    !(province %in% c("chongqing", "sichuan", "hainan") & Year == 1990)
  ) %>%
  # industry shares enter in logs
  mutate(across(
    c(Agriculture, Mining, Manufacturing, Construction, Services, Other),
    log
  )) %>%
  left_join(df_pop_py_province, by = c("province", "Year"))

# 0/1 dummy for provinces listed in the `coastal` vector.
df_main <- df_main %>%
  mutate(coastal = ifelse(province %in% coastal, 1, 0))

# Persistence of provincial growth across decades (Figs A4 and A5) ----
# GDP-per-capita growth reshaped to one row per province with one column per
# decade start year (`1990`, `2000`, `2010`).
decade_growth_wide <- function() {
  df_main %>%
    select(province, Year, delta_gdp_pc) %>%
    pivot_wider(names_from = Year, values_from = delta_gdp_pc)
}

# 1990s growth (x) against 2000s growth (y), in log points x 100.
# NOTE(review): label = 'province' is a constant string, not the column; no
# text geom is drawn, so it currently has no visible effect.
correlscatter1 <- ggplot(
  decade_growth_wide(),
  aes(x = 100 * `1990`, y = 100 * `2000`, label = 'province')
) +
  geom_point(shape = 16, size = 5) +
  geom_smooth(method = 'lm', se = FALSE) +
  labs(
    x = "Δlog(GDP per capita) from 1990 to 2000 – %",
    y = "Δlog(GDP per capita) from 2000 to 2010 – %"
  ) +
  theme(
    axis.text.y.right = element_blank(),
    axis.title.x = element_text(),
    axis.title.y.left = element_text(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  ) +
  scale_y_continuous(breaks = seq(90, 170, by = 20))

# 2000s growth (x) against 2010s growth (y).
correlscatter2 <- ggplot(
  decade_growth_wide(),
  aes(x = 100 * `2000`, y = 100 * `2010`, label = 'province')
) +
  geom_point(shape = 16, size = 5) +
  geom_smooth(method = 'lm', se = FALSE) +
  labs(
    x = "Δlog(GDP per capita) from 2000 to 2010 – %",
    y = "Δlog(GDP per capita) from 2010 to 2020 – %"
  ) +
  theme(
    axis.text.y.right = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  ) +
  scale_y_continuous(limits = c(60, 135), breaks = seq(60, 135, by = 15))

# No width/height is given, so ggsave() falls back to its default sizing.
ggsave(
  file = paste0(graph_output, "Fig A4 - correlscatter1.svg"),
  plot = correlscatter1, bg = "white"
)
ggsave(
  file = paste0(graph_output, "Fig A5 - correlscatter2.svg"),
  plot = correlscatter2, bg = "white"
)

# run graphs
# Scatter of two df_main columns (both multiplied by 100), coloured by decade
# with one linear fit per decade.
#   x_var, y_var     column names in df_main (strings)
#   x_label, y_label axis titles
#   y_limits         c(min, max) of the y axis
#   y_step           spacing of the y-axis breaks
make_decade_scatter <- function(x_var, y_var, x_label, y_label, y_limits, y_step) {
  ggplot(
    df_main,
    aes(
      x = 100 * .data[[x_var]],
      y = 100 * .data[[y_var]],
      colour = as.factor(Year)
    )
  ) +
    geom_point(shape = 16, size = 5) +
    scale_size(range = c(0, 25)) +
    geom_smooth(
      aes(group = Year, colour = as.factor(Year)),
      method = lm, se = FALSE
    ) +
    scale_color_manual(
      values = c("1990" = "#A569BD", "2000" = "#F5B041", "2010" = "#0072B2"),
      labels = c("1990" = "1990s", "2000" = "2000s", "2010" = "2010s")
    ) +
    scale_y_continuous(
      limits = y_limits,
      breaks = seq(y_limits[1], y_limits[2], by = y_step)
    ) +
    labs(x = x_label, y = y_label) +
    theme(
      legend.position = "bottom",
      axis.text.y.right = element_blank(),
      plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
    )
}

# Fig 7: GDP growth against the PREDICTED change in the old-age ratio.
scatter1 <- make_decade_scatter(
  "delta_pred_over60", "delta_gdp_pc",
  x_label = "Δlog(Predicted old-age ratio) – %",
  y_label = "Δlog(GDP per capita) – %",
  y_limits = c(40, 200), y_step = 40
)
# Fig 6: GDP growth against the ACTUAL change in the old-age ratio.
scatter2 <- make_decade_scatter(
  "delta_60plus", "delta_gdp_pc",
  x_label = "Δlog(Old-age ratio) – %",
  y_label = "Δlog(GDP per capita) – %",
  y_limits = c(40, 200), y_step = 40
)
# Fig 5: actual change (y) against predicted change (x).
# The aesthetics previously had these two variables the other way round, which
# contradicted the axis labels; they now match the labels.
scatter3 <- make_decade_scatter(
  "delta_pred_over60", "delta_60plus",
  x_label = "Δlog(Predicted old-age ratio) – %",
  y_label = "Δlog(Old-age ratio) – %",
  y_limits = c(-20, 60), y_step = 20
)

ggsave(file = paste0(graph_output, "Fig 7 - reg.svg"), plot = scatter1, bg = "white")
ggsave(file = paste0(graph_output, "Fig 6 - ols.svg"), plot = scatter2, bg = "white")
ggsave(file = paste0(graph_output, "Fig 5 - first_stage.svg"), plot = scatter3, bg = "white")

# Map preparation ----
# Switch off s2 spherical geometry in sf before the geometries are unioned.
sf::sf_use_s2(FALSE)

# Lookup between the province identifiers used in this script and the Chinese
# province names in the `china` dataset (Name_Province).
# NOTE: this replaces the character vector `provinces` defined at the top.
provinces <- data.frame(
  province = c(
    "anhui", "fujian", "gansu", "guangdong", "guizhou", "hainan", "hebei",
    "heilongjiang", "henan", "hubei", "hunan", "jiangsu", "jiangxi", "jilin",
    "liaoning", "qinghai", "shaanxi", "shandong", "shanxi", "sichuan", "yunnan",
    "zhejiang", "guangxi", "innermongolia", "ningxia", "tibet", "xinjiang",
    "beijing", "chongqing", "shanghai", "tianjin"
  ),
  Name_Province = c(
    "安徽省", "福建省", "甘肃省", "广东省", "贵州省", "海南省", "河北省",
    "黑龙江省", "河南省", "湖北省", "湖南省", "江苏省", "江西省", "吉林省",
    "辽宁省", "青海省", "陕西省", "山东省", "山西省", "四川省", "云南省",
    "浙江省", "广西壮族自治区", "内蒙古自治区", "宁夏回族自治区", "西藏自治区",
    "新疆维吾尔自治区", "北京市", "重庆市", "上海市", "天津市"
  )
)

# One merged geometry per province. An earlier assignment that filtered `china`
# to province codes 31-36 was removed: the statement below always restarted
# from the full `china` object and overwrote it, so it never had any effect.
china_provinces <- china %>%
  group_by(Name_Province) %>%
  summarise(geometry = sf::st_union(geometry))

# Map data: province geometries joined to the change in the old-age ratio ----

# 1990s map. chongqing is first given a placeholder 0 so that it survives the
# NA filter, and is then assigned sichuan's value.
china_provinces_1990 <- china_provinces %>%
  left_join(provinces, by = "Name_Province") %>%
  left_join(df_over60_change_1990, by = "province") %>%
  mutate(delta_60plus = ifelse(province == "chongqing", 0, delta_60plus)) %>%
  filter(!is.na(delta_60plus)) %>%
  mutate(delta_60plus = 100 * as.numeric(delta_60plus))
sichuan_replace <- china_provinces_1990$delta_60plus[
  china_provinces_1990$province %in% "sichuan"
]
china_provinces_1990 <- china_provinces_1990 %>%
  mutate(
    delta_60plus = ifelse(province == "chongqing", sichuan_replace, delta_60plus)
  )

# 2000s and 2010s maps: join, drop geometries without a value, and rescale the
# log change by 100.
attach_old_age_change <- function(change_by_province) {
  china_provinces %>%
    left_join(provinces, by = "Name_Province") %>%
    left_join(change_by_province, by = "province") %>%
    filter(!is.na(delta_60plus)) %>%
    mutate(delta_60plus = 100 * delta_60plus)
}
china_provinces_2000 <- attach_old_age_change(df_over60_change_2000)
china_provinces_2010 <- attach_old_age_change(df_over60_change_2010)

# Choropleth maps of the change in the old-age ratio (Figs A1-A3) ----
# All three maps share one fill scale (limits -20 to 60) so their colours are
# comparable across decades.
# NOTE(review): theme_map() is added after theme(legend.position = "none") and,
# being a complete theme, appears to override it -- confirm whether the legend
# is meant to be hidden.
old_age_change_map <- function(map_data) {
  ggplot(data = map_data) +
    geom_sf(aes(fill = delta_60plus)) +
    scale_fill_distiller(
      type = "seq",
      palette = "YlGnBu",
      direction = 1,
      limits = c(-20, 60),
      breaks = c(0, 20, 40),
      name = "Δlog(Old-age ratio)",
      guide = guide_colorbar(
        ticks = TRUE,
        ticks.colour = "black",
        frame.colour = "black"
      )
    ) +
    theme(legend.position = "none") +
    theme_map() +
    theme(
      legend.text = element_text(size = 14),
      legend.title = element_text(size = 14)
    )
}

map_1990 <- old_age_change_map(china_provinces_1990)
map_2000 <- old_age_change_map(china_provinces_2000)
map_2010 <- old_age_change_map(china_provinces_2010)

ggsave(file = paste0(graph_output, "Fig A1 - map_1990.svg"), plot = map_1990, bg = "white")
ggsave(file = paste0(graph_output, "Fig A2 - map_2000.svg"), plot = map_2000, bg = "white")
ggsave(file = paste0(graph_output, "Fig A3 - map_2010.svg"), plot = map_2010, bg = "white")

# Rotemberg weights cleaning
# Goal: province-by-age tables for 1982, 1990, 2000 and 2010 that share the
# same age labels, each converted into within-province age shares.

# 1982: ages are binned into 5-year groups and summed; sichuan is dropped.
clean_1982 <- df_1982_census_age_by_province %>%
  select(-sichuan) %>%
  mutate(Age = cut(Age, breaks = seq(-1, 105, by = 5))) %>%
  group_by(Age) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(Age = as.character(Age))

# 1990: used as is, minus sichuan and hainan.
clean_1990 <- df_1990_census_age_by_province %>%
  select(-c(sichuan, hainan))

# 2000: the "0" and "1-4" rows are merged into a single "0-4" row.
filtered_df <- df_2000_census_age_by_province %>%
  filter(Age %in% c("0", "1-4")) %>%
  select(-Age) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(Age = "0-4") %>%
  select(Age, everything())
clean_2000 <- rbind(
  filtered_df,
  df_2000_census_age_by_province %>% filter(!Age %in% c("0", "1-4"))
) %>%
  select(-c(sichuan, chongqing, hainan))

# The 1982 interval labels from cut() are replaced BY POSITION with the first
# 20 age labels of the cleaned 2000 table.
clean_1982$Age <- clean_2000$Age[1:20]

# 2010: same treatment as 2000.
filtered_df <- df_2010_census_age_by_province %>%
  filter(Age %in% c("0", "1-4")) %>%
  select(-Age) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
  mutate(Age = "0-4") %>%
  select(Age, everything())
clean_2010 <- rbind(
  filtered_df,
  df_2010_census_age_by_province %>% filter(!Age %in% c("0", "1-4"))
) %>%
  select(-c(sichuan, chongqing, hainan))

master_df <- rbind(clean_1982, clean_1990, clean_2000, clean_2010)

# Convert every province column to its share of that province's column total.
clean_1982_props <- clean_1982 %>%
  mutate(across(-Age, ~ .x / sum(.x))) %>%
  mutate(Year = 1982)
clean_1990_props <- clean_1990 %>%
  mutate(across(-Age, ~ .x / sum(.x))) %>%
  mutate(Year = 1990)
clean_2000_props <- clean_2000 %>%
  mutate(across(-Age, ~ .x / sum(.x))) %>%
  mutate(Year = 2000)
clean_2010_props <- clean_2010 %>%
  mutate(across(-Age, ~ .x / sum(.x))) %>%
  mutate(Year = 2010)

clean_master_df <- rbind(
  clean_1982_props,
  clean_1990_props,
  clean_2000_props,
  clean_2010_props
)

# survival odds column
# For each decade this builds, per age group, the difference between the
# longer-horizon and the shorter-horizon national survival ratio.

# 5-year age bands 0-4 ... 75-79 used to aggregate the single-year 1982 ratios.
age_bins <- c(-1, 4, 9, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79)
age_labels <- c('0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79')

# 1990 observation: (1982 -> 2000 ratio) minus (1982 -> 1990 ratio) for the
# first 80 rows. The two tables are subtracted BY ROW POSITION (column 2 holds
# the ratio, column 1 the Age), so both must list the same ages in the same order.
odds_1990 <- survival_odds_1982_2000[1:80,2] - survival_odds_1982_1990[1:80,2]
odds_1990 <- cbind(odds_1990, survival_odds_1982_2000[1:80,1])
# Aggregate to the 5-year bands, weighting each age by its implied 1982
# national population.
odds_1990 <- odds_1990 %>% merge(df_1982_implied_census_age_national, by="Age") %>%
  group_by(age_group = cut(Age, breaks=age_bins, labels=age_labels)) %>%
  mutate(survival_pop = weighted_survival_rate*Population) %>%
  group_by(age_group) %>%
  summarise(Population = sum(Population),
            survival_pop = sum(survival_pop),
            weighted_survival_rate = survival_pop/Population) %>%
  select(-c(Population, survival_pop)) %>%
  rename('Age' = 'age_group') %>%
  mutate(Year=1990)

# 2000 observation: (1990 -> 2010) minus (1990 -> 2000), first 16 age groups.
odds_2000 <- survival_odds_1990_2010[1:16,2] - survival_odds_1990_2000[1:16,2]
odds_2000 <- cbind(odds_2000, survival_odds_1990_2010[1:16,1]) %>%
  mutate(Year=2000)

# 2010 observation: (2000 -> 2020) minus (2000 -> 2010), first 17 age groups.
odds_2010 <- survival_odds_2000_2020[1:17,2] - survival_odds_2000_2010[1:17,2]
odds_2010 <- cbind(odds_2010, survival_odds_2000_2020[1:17,1]) %>%
  mutate(Year=2010)
# The first two rows are combined into one "0-4" row. The weight is the first
# Population value among ages 0-4 in the 2000 national census divided by their
# total (presumably the share of age 0; assumes the rows are sorted by age --
# TODO confirm).
weight_scalar <- df_2000_census_age_national %>% filter(Age<=4) %>%
  summarise(zero_weight = first(Population)/sum(Population))
odds_2010[2,1] <- odds_2010[1,1]*weight_scalar + odds_2010[2,1]*(1-weight_scalar)
# Drop the first row and relabel the combined row.
odds_2010 <- odds_2010[2:nrow(odds_2010),]
odds_2010[1,2] <- "0-4"

# Join age shares and survival-ratio differences on (Age, Year), reshape to one
# row per (province, Year) with columns weighted_survival_rate_<Age> and
# pop_share_<Age>, then attach the df_main variables.
master_odds <- rbind(odds_1990, odds_2000, odds_2010)
rotemberg <- merge(clean_master_df, master_odds, by=c("Age", "Year")) %>%
  pivot_longer(-c(Age, weighted_survival_rate, Year), names_to="province", values_to="pop_share") %>%
  pivot_wider(names_from="Age", names_glue="{.value}_{Age}", values_from=c(weighted_survival_rate, pop_share)) %>%
  merge(df_main, by=c("province", "Year"))

# output data for Stata
write.csv(rotemberg, paste0(file_output, "ageing_rotemberg.csv"))

# 30 year lag robustness
# generate predicted proportion over 60 from 2010 to 2020, using the 1990
# census (30 years before 2020) as the base age structure.
# NOTE: this section overwrites survival_odds_1990_2010, provincial_1990,
# predicted_provincial_2010/2020 and pred_over60_change_2010 from the main
# specification.

# 30-year survival ratios: 2020 national ages are shifted back 30 years to line
# up with the same cohort in 1990. Ratios are averaged within 5-year age
# buckets, weighted by the later census population (Population.x); age 0 gets
# its own bucket (code 0) and ifelse() turns the cut() factor into integer codes.
survival_odds_1990_2020 <- merge(
  df_2020_census_age_national %>%
    mutate(Year = 2020, actual_age = Age, Age = Age - 30),
  df_1990_census_age_national %>% mutate(Year = 1990),
  by = "Age"
) %>%
  mutate(
    survival_ratio = Population.x / Population.y,
    age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = FALSE)
  ) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename("Age" = "age_bucket")
# Bucket codes are replaced BY POSITION with the provincial 1990 age labels.
# NOTE(review): relies on the buckets and the provincial rows being in the same
# order and of equal number -- verify against the workbook.
survival_odds_1990_2020$Age <- df_1990_census_age_by_province$Age[-c(17, 18, 19, 20, 21, 22)]

# 20-year survival ratios (2010 ages shifted back 20 years). The base census is
# the 1990 one, so it is tagged Year = 1990 (it was previously mislabelled 2000;
# the Year column is dropped by summarize(), so results are unaffected).
survival_odds_1990_2010 <- merge(
  df_2010_census_age_national %>%
    mutate(Year = 2010, actual_age = Age, Age = Age - 20),
  df_1990_census_age_national %>% mutate(Year = 1990),
  by = "Age"
) %>%
  mutate(
    survival_ratio = Population.x / Population.y,
    age_bucket = cut(Age, breaks = seq(0, 105, by = 5), right = FALSE)
  ) %>%
  mutate(age_bucket = ifelse(Age == 0, 0, age_bucket)) %>%
  group_by(age_bucket) %>%
  summarize(weighted_survival_rate = weighted.mean(survival_ratio, Population.x)) %>%
  rename("Age" = "age_bucket")
survival_odds_1990_2010$Age <- df_1990_census_age_by_province$Age[-c(19, 20, 21, 22)]

# Provincial 1990 age structure in long form: Age label, province, count.
provincial_1990 <- df_1990_census_age_by_province %>%
  pivot_longer(cols = -Age, names_to = "province", values_to = "count")

# Predicted 2020 ratio: aged 30+ in 1990 -> 60+ in 2020; denominator all ages.
predicted_provincial_2020 <- left_join(provincial_1990, survival_odds_1990_2020, by = "Age") %>%
  mutate(predicted_2020 = weighted_survival_rate * count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2020)) %>%
  summarize(
    prop_60_or_older_2020 =
      sum(ifelse(Age >= 30, predicted_2020, 0)) / sum(ifelse(Age >= 0, predicted_2020, 0))
  )

# Predicted 2010 ratio: aged 40+ in 1990 -> 60+ in 2010; denominator all ages.
predicted_provincial_2010 <- left_join(provincial_1990, survival_odds_1990_2010, by = "Age") %>%
  mutate(predicted_2010 = weighted_survival_rate * count) %>%
  mutate(Age = as.numeric(str_extract(Age, "\\d+"))) %>%
  group_by(province) %>%
  filter(!is.na(predicted_2010)) %>%
  summarize(
    prop_60_or_older_2010 =
      sum(ifelse(Age >= 40, predicted_2010, 0)) / sum(ifelse(Age >= 0, predicted_2010, 0))
  )

# Predicted 2010 -> 2020 log change, tagged Year = 2010 (no levels version here).
pred_over60_change_2010 <- merge(predicted_provincial_2020, predicted_provincial_2010, by = "province") %>%
  mutate(delta_pred_over60 = log(prop_60_or_older_2020) - log(prop_60_or_older_2010)) %>%
  select(province, delta_pred_over60) %>%
  mutate(Year = 2010)

# generate predicted proportion over 60 from 1990 to 2000, first generating an implied national age structure for 1982
# Turn the 1982 provincial age counts into within-province age shares
# (missing cells count as zero) and scale them by the `population` column that
# arrives with df_1982_census_pop_by_province.
# The result stays grouped by province.
df_1982_implied_census_age_by_province <- df_1982_census_age_by_province %>%
  pivot_longer(cols = -Age, names_to = "province", values_to = "sample_pop") %>%
  group_by(province) %>%
  mutate(
    sample_pop = ifelse(is.na(sample_pop), 0, sample_pop),
    sample_pop = sample_pop / sum(sample_pop)
  ) %>%
  left_join(df_1982_census_pop_by_province, by = "province") %>%
  mutate(population = population * sample_pop) %>%
  select(-sample_pop)

# Sum the implied provincial populations by age to get the national structure.
df_1982_implied_census_age_national <- df_1982_implied_census_age_by_province %>%
  group_by(Age) %>%
  summarise(Population = sum(population))

# National cohort survival ratios from 1982 to 2010: 2010 ages are shifted
# back 28 years so the inner join on Age pairs each cohort with itself.
survival_odds_1982_2010 <- merge(
  mutate(df_2010_census_age_national, Year = 2010, actual_age = Age, Age = Age - 28),
  mutate(df_1982_implied_census_age_national, Year = 1982),
  by = "Age"
) %>%
  mutate(survival_ratio = Population.x / Population.y) %>%
  group_by(Age) %>%
  summarize(
    weighted_survival_rate = weighted.mean(survival_ratio, Population.x)
  )

# National cohort survival ratios from 1982 to 2000: 2000 ages are shifted
# back 18 years so the inner join on Age pairs each cohort with itself.
survival_odds_1982_2000 <- merge(
  mutate(df_2000_census_age_national, Year = 2000, actual_age = Age, Age = Age - 18),
  mutate(df_1982_implied_census_age_national, Year = 1982),
  by = "Age"
) %>%
  mutate(survival_ratio = Population.x / Population.y) %>%
  group_by(Age) %>%
  summarize(
    weighted_survival_rate = weighted.mean(survival_ratio, Population.x)
  )

# 1982 provincial age structure with the implied population renamed to `count`.
provincial_1982 <- rename(df_1982_implied_census_age_by_province, count = population)

# Project the 1982 cohorts to 2010; people aged 60 or older in 2010 were aged
# 32 or more in 1982.
predicted_provincial_2010 <- provincial_1982 %>%
  left_join(survival_odds_1982_2010, by = "Age") %>%
  mutate(predicted_2010 = weighted_survival_rate * count) %>%
  # ages without a survival ratio cannot be projected
  filter(!is.na(predicted_2010)) %>%
  group_by(province) %>%
  summarize(
    prop_60_or_older_2010 =
      sum(ifelse(Age >= 32, predicted_2010, 0)) /
        sum(ifelse(Age >= 0, predicted_2010, 0))
  )

# Project the 1982 cohorts to 2000; people aged 60 or older in 2000 were aged
# 42 or more in 1982.
# NOTE(review): the denominator starts at Age >= 2 while the other projections
# use Age >= 0; with the 18-year shift that corresponds to ages 20+ in 2000 -
# confirm this is intended.
predicted_provincial_2000 <- provincial_1982 %>%
  left_join(survival_odds_1982_2000, by = "Age") %>%
  mutate(predicted_2000 = weighted_survival_rate * count) %>%
  # ages without a survival ratio cannot be projected
  filter(!is.na(predicted_2000)) %>%
  group_by(province) %>%
  summarize(
    prop_60_or_older_2000 =
      sum(ifelse(Age >= 42, predicted_2000, 0)) /
        sum(ifelse(Age >= 2, predicted_2000, 0))
  )

# Log change in the predicted 60+ share between 2000 and 2010 for each
# province, stored under Year = 2000.
pred_over60_change_2000 <- predicted_provincial_2010 %>%
  merge(predicted_provincial_2000, by = "province") %>%
  mutate(
    delta_pred_over60 = log(prop_60_or_older_2010) - log(prop_60_or_older_2000),
    Year = 2000
  ) %>%
  select(province, delta_pred_over60, Year)

# combine variables
df_list_30lag <- list(
  df_gdp_pc,
  df_census_industries,
  df_over60_change,
  pred_over60_change_2010,
  pred_over60_change_2000
)
# Left-join everything on (province, Year). Both predicted-change tables carry
# delta_pred_over60, so the join leaves .x/.y copies that are folded back into
# one column: the first non-missing of the two.
df_main_30lag <- reduce(df_list_30lag, left_join, by = c('province', 'Year')) %>%
  mutate(
    delta_pred_over60 = ifelse(
      !is.na(delta_pred_over60.x),
      delta_pred_over60.x,
      delta_pred_over60.y
    )
  ) %>%
  select(-c(delta_pred_over60.x, delta_pred_over60.y)) %>%
  # drop the province/decade combinations excluded from the sample
  filter(
    !(province %in% c('chongqing', 'sichuan') & Year == 2000),
    !(province %in% c('chongqing', 'sichuan', 'hainan') & Year == 1990)
  ) %>%
  # sector variables enter in logs
  mutate(
    across(
      c(Agriculture, Mining, Manufacturing, Construction, Services, Other),
      log
    )
  ) %>%
  left_join(df_pop_py_province, by = c('province', 'Year')) %>%
  filter(Year %in% c(2000, 2010))

# Flag coastal provinces (1 = in the `coastal` list, 0 otherwise) and export.
df_main_30lag <- mutate(
  df_main_30lag,
  coastal = as.numeric(province %in% coastal)
)

write.csv(df_main_30lag, paste0(file_output, 'lag_30.csv'))

# 40 year lag robustness
# generate the predicted change in the proportion over 60 between 2010 and 2020 (stored under Year = 2010), starting from an implied national age structure for 1982
# Rebuild the implied 1982 age structure for this section: within-province
# age shares (missing cells count as zero) scaled by the `population` column
# that arrives with df_1982_census_pop_by_province.
# The result stays grouped by province.
df_1982_implied_census_age_by_province <- df_1982_census_age_by_province %>%
  pivot_longer(cols = -Age, names_to = "province", values_to = "sample_pop") %>%
  group_by(province) %>%
  mutate(
    sample_pop = ifelse(is.na(sample_pop), 0, sample_pop),
    sample_pop = sample_pop / sum(sample_pop)
  ) %>%
  left_join(df_1982_census_pop_by_province, by = "province") %>%
  mutate(population = population * sample_pop) %>%
  select(-sample_pop)

# Sum the implied provincial populations by age to get the national structure.
df_1982_implied_census_age_national <- df_1982_implied_census_age_by_province %>%
  group_by(Age) %>%
  summarise(Population = sum(population))

# National cohort survival ratios from 1982 to 2020: 2020 ages are shifted
# back 38 years so the inner join on Age pairs each cohort with itself.
survival_odds_1982_2020 <- merge(
  mutate(df_2020_census_age_national, Year = 2020, actual_age = Age, Age = Age - 38),
  mutate(df_1982_implied_census_age_national, Year = 1982),
  by = "Age"
) %>%
  mutate(survival_ratio = Population.x / Population.y) %>%
  group_by(Age) %>%
  summarize(
    weighted_survival_rate = weighted.mean(survival_ratio, Population.x)
  )

# National cohort survival ratios from 1982 to 2010: 2010 ages are shifted
# back 28 years so the inner join on Age pairs each cohort with itself.
survival_odds_1982_2010 <- merge(
  mutate(df_2010_census_age_national, Year = 2010, actual_age = Age, Age = Age - 28),
  mutate(df_1982_implied_census_age_national, Year = 1982),
  by = "Age"
) %>%
  mutate(survival_ratio = Population.x / Population.y) %>%
  group_by(Age) %>%
  summarize(
    weighted_survival_rate = weighted.mean(survival_ratio, Population.x)
  )

# 1982 provincial age structure with the implied population renamed to `count`
# for the projections below.
provincial_1982 <- df_1982_implied_census_age_by_province %>%
  rename('count' = 'population')

# Projected 2020 population by province: 1982 cohorts times the national
# 1982 -> 2020 survival ratios. People aged 60 or older in 2020 were aged 22 or
# more in 1982 (2020 - 1982 = 38 years), so the numerator starts at Age >= 22.
predicted_provincial_2020 <- left_join(provincial_1982, survival_odds_1982_2020, by="Age") %>%
  mutate(predicted_2020 = weighted_survival_rate*count) %>%
  group_by(province) %>%
  # ages without a survival ratio cannot be projected
  filter(!is.na(predicted_2020)) %>%
  summarize(prop_60_or_older_2020 = sum(ifelse(Age >= 22, predicted_2020, 0))/sum(ifelse(Age >= 0, predicted_2020, 0)))

# Projected 2010 population by province: people aged 60 or older in 2010 were
# aged 32 or more in 1982 (2010 - 1982 = 28 years), the same cut-off the
# 30-year-lag section uses for its 1982 -> 2010 projection.
predicted_provincial_2010 <- left_join(provincial_1982, survival_odds_1982_2010, by="Age") %>%
  mutate(predicted_2010 = weighted_survival_rate*count) %>%
  group_by(province) %>%
  # ages without a survival ratio cannot be projected
  filter(!is.na(predicted_2010)) %>%
  summarize(prop_60_or_older_2010 = sum(ifelse(Age >= 32, predicted_2010, 0))/sum(ifelse(Age >= 0, predicted_2010, 0)))

# Log change in the predicted 60+ share between 2010 and 2020 for each
# province (projected from 1982), stored under Year = 2010.
pred_over60_change_2010 <- predicted_provincial_2010 %>%
  merge(predicted_provincial_2020, by = "province") %>%
  mutate(
    delta_pred_over60 = log(prop_60_or_older_2020) - log(prop_60_or_older_2010),
    Year = 2010
  ) %>%
  select(province, delta_pred_over60, Year)

# combine variables
df_list_40lag <- list(
  df_gdp_pc,
  df_census_industries,
  df_over60_change,
  pred_over60_change_2010
)
# Left-join everything on (province, Year), drop the excluded provinces, log
# the sector variables, attach df_pop_py_province and keep the Year = 2010 rows.
df_main_40lag <- reduce(df_list_40lag, left_join, by = c('province', 'Year')) %>%
  filter(!province %in% c('chongqing', 'sichuan', 'hainan')) %>%
  mutate(
    across(
      c(Agriculture, Mining, Manufacturing, Construction, Services, Other),
      log
    )
  ) %>%
  left_join(df_pop_py_province, by = c('province', 'Year')) %>%
  filter(Year %in% c(2010))

# Flag coastal provinces (1 = in the `coastal` list, 0 otherwise) and export.
df_main_40lag <- mutate(
  df_main_40lag,
  coastal = as.numeric(province %in% coastal)
)

write.csv(df_main_40lag, paste0(file_output, 'lag_40.csv'))

# migration and life expectancy exercise
# Decade changes by city, reshaped to one row per (city, year):
#   migrdiff  - log change in (pop - regpop)
#   lifexdiff - change in lifex
#   gdpdiff   - log change in gdppc
# The suffix of each column is the start year of the decade; after the reshape
# it becomes the (character) `year` column.
df_migr_life <- readxl::read_excel(
  paste0(input_folder, 'migration_lifex.xlsx'),
  sheet = 'Sheet1'
) %>%
  mutate(
    migrdiff_1990 = log(pop_2000 - regpop_2000) - log(pop_1990 - regpop_1990),
    migrdiff_2000 = log(pop_2010 - regpop_2010) - log(pop_2000 - regpop_2000),
    migrdiff_2010 = log(pop_2020 - regpop_2020) - log(pop_2010 - regpop_2010),
    lifexdiff_1990 = lifex_2000 - lifex_1990,
    lifexdiff_2000 = lifex_2010 - lifex_2000,
    lifexdiff_2010 = lifex_2020 - lifex_2010,
    gdpdiff_1990 = log(gdppc_2000) - log(gdppc_1990),
    gdpdiff_2000 = log(gdppc_2010) - log(gdppc_2000),
    gdpdiff_2010 = log(gdppc_2020) - log(gdppc_2010)
  ) %>%
  # rows without a city name are dropped
  filter(!is.na(city)) %>%
  select(
    city,
    migrdiff_2000, migrdiff_2010, migrdiff_1990,
    lifexdiff_2000, lifexdiff_2010, lifexdiff_1990,
    gdpdiff_2000, gdpdiff_2010, gdpdiff_1990
  ) %>%
  # "<variable>_<year>" columns -> long form -> one column per variable
  pivot_longer(
    cols = -city,
    names_to = c("variable", "year"),
    names_sep = "_"
  ) %>%
  pivot_wider(names_from = variable, values_from = value)

# Scatter of the log change in unregistered population against the log change
# in GDP (both x100), coloured by decade, with one linear fit per decade.
migr_correl <- ggplot(
  df_migr_life,
  aes(x = 100 * gdpdiff, y = 100 * migrdiff, colour = as.factor(year))
) +
  geom_point(shape = 16, size = 5) +
  scale_size(range = c(0, 25)) +
  geom_smooth(
    aes(group = year, colour = as.factor(year)),
    method = lm,
    se = FALSE
  ) +
  scale_color_manual(
    values = c("1990" = "#A569BD", "2000" = "#F5B041", "2010" = "#0072B2"),
    labels = c("1990" = "1990s", "2000" = "2000s", "2010" = "2010s")
  ) +
  labs(x = "Δlog(GDP) – %", y = "Δlog(Unregistered population) – %") +
  theme(
    legend.position = "bottom",
    axis.text.y.right = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  )
# Scatter of the change in life expectancy (years) against the log change in
# GDP (x100), coloured by decade, with one linear fit per decade.
life_correl <- ggplot(
  df_migr_life,
  aes(x = 100 * gdpdiff, y = lifexdiff, colour = as.factor(year))
) +
  geom_point(shape = 16, size = 5) +
  scale_size(range = c(0, 25)) +
  geom_smooth(
    aes(group = year, colour = as.factor(year)),
    method = lm,
    se = FALSE
  ) +
  scale_color_manual(
    values = c("1990" = "#A569BD", "2000" = "#F5B041", "2010" = "#0072B2"),
    labels = c("1990" = "1990s", "2000" = "2000s", "2010" = "2010s")
  ) +
  labs(x = "Δlog(GDP) – %", y = "Δ Life expectancy – years") +
  theme(
    legend.position = "bottom",
    axis.text.y.right = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  )

# Write Figures 3 and 4 as SVG files on a white background. The argument is
# spelled out as `filename` instead of relying on partial matching of `file`.
ggsave(
  filename = paste0(graph_output, "Fig 3 - migr_correl.svg"),
  plot = migr_correl,
  bg = "white"
)
ggsave(
  filename = paste0(graph_output, "Fig 4 - life_correl.svg"),
  plot = life_correl,
  bg = "white"
)

# cross-provinces sectoral exercise
# Decade changes in sector shares of GDP by province, reshaped to one row per
# (province, Year) and joined to df_main:
#   servicesdiff  - change in (services - reservices) / gdp (2000s and 2010s)
#   constructdiff - change in gdpconstruct / gdp (1990s, 2000s and 2010s)
# The suffix of each column is the start year of the decade.
df_health_construct_cn <- readxl::read_excel(
  paste0(input_folder, 'health_construction_cn.xlsx'),
  sheet = 'Sheet1'
) %>%
  mutate(
    servicesdiff_2000 = ((services_2010 - reservices_2010) / gdp_2010) -
      ((services_2000 - reservices_2000) / gdp_2000),
    servicesdiff_2010 = ((services_2020 - reservices_2020) / gdp_2020) -
      ((services_2010 - reservices_2010) / gdp_2010),
    constructdiff_1990 = ((gdpconstruct_2000 / gdp_2000) - (gdpconstruct_1990 / gdp_1990)),
    constructdiff_2000 = ((gdpconstruct_2010 / gdp_2010) - (gdpconstruct_2000 / gdp_2000)),
    constructdiff_2010 = ((gdpconstruct_2020 / gdp_2020) - (gdpconstruct_2010 / gdp_2010))
  ) %>%
  # rows without a province name are dropped
  filter(!is.na(province)) %>%
  select(
    province,
    servicesdiff_2000, servicesdiff_2010,
    constructdiff_1990, constructdiff_2000, constructdiff_2010
  ) %>%
  # "<variable>_<Year>" columns -> long form -> one column per variable
  pivot_longer(
    cols = -province,
    names_to = c("variable", "Year"),
    names_sep = "_"
  ) %>%
  pivot_wider(names_from = variable, values_from = value) %>%
  # match the key types and casing used by df_main before joining
  mutate(Year = as.numeric(Year), province = str_to_lower(province)) %>%
  left_join(df_main, by = c("province", "Year"))

write.csv(df_health_construct_cn, paste0(file_output, 'health_construct.csv'))

# Scatter of the change in the services share of GDP against the log change in
# the predicted old-age ratio (both x100), coloured by decade, with one linear
# fit per decade. The y axis is limited to [-20, 20] with breaks every 10.
service_correl <- ggplot(
  df_health_construct_cn,
  aes(x = 100 * delta_pred_over60, y = 100 * servicesdiff, colour = as.factor(Year))
) +
  geom_point(shape = 16, size = 5) +
  scale_size(range = c(0, 25)) +
  geom_smooth(
    aes(group = Year, colour = as.factor(Year)),
    method = lm,
    se = FALSE
  ) +
  scale_color_manual(
    values = c("2000" = "#F5B041", "2010" = "#0072B2"),
    labels = c("2000" = "2000s", "2010" = "2010s")
  ) +
  labs(x = "Δlog(Predicted old-age ratio) – %", y = "ΔServices/GDP – %") +
  # `by` is spelled out rather than relying on partial matching of `b`
  scale_y_continuous(limits = c(-20, 20), breaks = seq(-20, 20, by = 10)) +
  theme(
    legend.position = "bottom",
    axis.text.y.right = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  )
# Scatter of the change in the construction share of GDP against the log
# change in the predicted old-age ratio (both x100), coloured by decade, with
# one linear fit per decade.
construct_correl <- ggplot(
  df_health_construct_cn,
  aes(x = 100 * delta_pred_over60, y = 100 * constructdiff, colour = as.factor(Year))
) +
  geom_point(shape = 16, size = 5) +
  scale_size(range = c(0, 25)) +
  geom_smooth(
    aes(group = Year, colour = as.factor(Year)),
    method = lm,
    se = FALSE
  ) +
  scale_color_manual(
    values = c("1990" = "#A569BD", "2000" = "#F5B041", "2010" = "#0072B2"),
    labels = c("1990" = "1990s", "2000" = "2000s", "2010" = "2010s")
  ) +
  labs(x = "Δlog(Predicted old-age ratio) – %", y = "ΔConstruction/GDP – %") +
  theme(
    legend.position = "bottom",
    axis.text.y.right = element_blank(),
    plot.margin = margin(t = 5, r = 30, b = 5, l = 5)
  )
# Write Figures 11 and 12 as SVG files on a white background. The argument is
# spelled out as `filename` instead of relying on partial matching of `file`.
ggsave(
  filename = paste0(graph_output, "Fig 11 - service_correl.svg"),
  plot = service_correl,
  bg = "white"
)
ggsave(
  filename = paste0(graph_output, "Fig 12 - construct_correl.svg"),
  plot = construct_correl,
  bg = "white"
)