library(tidyverse)
library(arrow)
library(brpop)
library(tidymodels)
library(timetk)
source("functions.R")
Bundled data
This notebook prepares a dataset with dengue cases and covariates of interest per municipality.
Packages
Data
Dengue
Load dengue cases data and aggregate per week.
# Load the aggregated dengue cases and sum them per municipality and week.
# `data_dir()` is not defined in this file; presumably it comes from
# functions.R and resolves a path inside the project data folder (TODO confirm).
dengue <- read_parquet(
  data_dir("dengue_data/parquet_aggregated/dengue_md.parquet")
) %>%
  group_by(mun) %>%
  # One row per municipality and week; missing counts are ignored in the sum
  summarise_by_time(
    .date_var = date,
    .by = "week",
    freq = sum(freq, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  rename(cases = freq)
Population
Load municipality population data for the years present in the dengue cases data.
# Municipality population, restricted to the years covered by the dengue data.
# `mun` is converted to character so it can be used as a join key with `dengue`.
pop <- mun_pop_totals() %>%
  filter(year %in% seq(year(min(dengue$date)), year(max(dengue$date)))) %>%
  mutate(mun = as.character(mun))

# Carry the 2021 estimates forward as 2022. This is done only when 2022 is
# absent, so that (mun, year) keys are never duplicated; duplicated keys would
# multiply rows when the population is later joined to the dengue cases.
if (!(2022 %in% pop$year)) {
  pop <- bind_rows(
    pop,
    pop %>%
      filter(year == 2021) %>%
      mutate(year = 2022)
  )
}
Weather variables
Load weather variables and group per week.
Precipitation
# Weekly precipitation: daily values are summed within each week.
# NOTE(review): `select(date, value)` drops every other column, so the result
# has one row per week for the whole file. If the parquet holds one row per
# municipality and day, this sums over all municipalities -- confirm intended.
prec <- open_dataset(
  sources = data_dir("weather_data/parquet/era5/total_precipitation_sum.parquet")
) %>%
  filter(name == "total_precipitation_sum_sum") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue data
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    value = sum(value, na.rm = TRUE)
  ) %>%
  rename(prec = value)
As precipitation is a volume, the `sum` function is used to aggregate it per week.
Average maximum temperature
# Weekly maximum temperature: daily values are averaged within each week.
# NOTE(review): `select(date, value)` drops every other column, so the result
# has one row per week for the whole file. If the parquet holds one row per
# municipality and day, this averages over all municipalities -- confirm.
tmax <- open_dataset(
  sources = data_dir("weather_data/parquet/era5/2m_temperature_max.parquet")
) %>%
  filter(name == "2m_temperature_max_mean") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue data
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    value = mean(value, na.rm = TRUE)
  ) %>%
  rename(tmax = value)
Average minimum temperature
# Weekly minimum temperature: daily values are averaged within each week.
# NOTE(review): `select(date, value)` drops every other column, so the result
# has one row per week for the whole file. If the parquet holds one row per
# municipality and day, this averages over all municipalities -- confirm.
tmin <- open_dataset(
  sources = data_dir("weather_data/parquet/era5/2m_temperature_min.parquet")
) %>%
  filter(name == "2m_temperature_min_mean") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue data
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    value = mean(value, na.rm = TRUE)
  ) %>%
  rename(tmin = value)
Join data
Join dengue cases, population and weather variables.
# Attach population and weather to the weekly dengue cases.
# Inner joins are used throughout, so a municipality-week is kept only when a
# population row exists for its year and all three weather tables have its week.
tdengue <- dengue %>%
  # Temporary key: population is matched on the calendar year of the week
  mutate(dengue_year = year(date)) %>%
  inner_join(pop, by = c("dengue_year" = "year", "mun")) %>%
  select(-dengue_year) %>%
  # Weather tables are keyed by week only
  inner_join(prec, by = "date") %>%
  inner_join(tmax, by = "date") %>%
  inner_join(tmin, by = "date")

# Free the intermediate tables; `pop` is kept because it is still used below
rm(dengue, prec, tmax, tmin)
The population estimate is constant over each year.
Cleaning and basic features
Due to the sparsity of dengue cases, only municipalities with at least 50,000 inhabitants are kept.
# List municipalities with at least 50k inhabitants in the latest year
mun_vec <- pop %>%
  filter(year == max(year)) %>%
  filter(pop >= 50000) %>%
  pull(mun)

rm(pop)

# Padding range, taken from the joined data before any row is dropped
pad_start <- min(tdengue$date)
pad_end <- max(tdengue$date)

tdengue <- tdengue %>%
  # Remove municipalities with zero population
  filter(pop != 0) %>%
  # Keep only municipalities in the list
  filter(mun %in% mun_vec) %>%
  # Compute dengue rate per population
  # mutate(cases = cases/pop*100000) %>%
  # Remove population
  select(-pop) %>%
  # Round values
  mutate(across(c(cases, prec, tmax, tmin), ~ round(.x, digits = 2)))

# Per-week weather lookup. The weather columns were joined on `date` alone, so
# within a given week they hold the same values for every municipality.
weather_week <- tdengue %>%
  select(date, prec, tmax, tmin) %>%
  distinct(date, .keep_all = TRUE)

tdengue <- tdengue %>%
  # Weather is set aside while padding: `.pad_value = 0` fills every column of
  # an inserted row, and a zero is only meaningful for the case count.
  select(-c(prec, tmax, tmin)) %>%
  # Pad weeks: a week without a record for a municipality means zero cases
  group_by(mun) %>%
  pad_by_time(
    date,
    .by = "week",
    .pad_value = 0,
    .start_date = pad_start,
    .end_date = pad_end
  ) %>%
  ungroup() %>%
  # Restore the weather of each week, including the padded ones. A week that
  # has no record in any kept municipality gets NA weather instead of 0.
  left_join(weather_week, by = "date")

rm(weather_week, pad_start, pad_end)
Municipalities remaining in the dataset: 679
The computation of dengue incidence (cases per population) is commented out to keep the raw case counts.
Standardize measures
Center each variable around its mean and scale it to unit standard deviation, separately for each municipality.
\[ x' = \frac{x - \mu}{\sigma} \]
# Standardize cases, cumulative cases and weather within each municipality,
# keeping untransformed copies of the case counts in the `*_raw` columns.
tdengue <- tdengue %>%
  # Keep the raw weekly count before `cases` is overwritten below
  mutate(cases_raw = cases) %>%
  group_by(mun) %>%
  # Chronological order is required for the cumulative sum
  arrange(date) %>%
  mutate(cases_cum_raw = cumsum(cases_raw)) %>%
  mutate(cases_cum = cases_cum_raw) %>%
  # Grouped mutate: each column is standardized per municipality
  mutate(
    across(
      c(cases, cases_cum, prec, tmax, tmin),
      ~ standardize_vec(.x, silent = TRUE)
    )
  ) %>%
  ungroup()
Lag and rolling lag variables
Creates lagged variables from standardized dengue cases and weather variables, from 1 to 24 weeks (6 months).
# Add lagged copies (1 to 24 weeks) of the standardized cases and weather
# columns, computed within each municipality. The first k rows of a
# municipality are NA for a lag of k weeks.
tdengue <- tdengue %>%
  group_by(mun) %>%
  tk_augment_lags(.value = c(cases, prec, tmax, tmin), .lags = 1:24) %>%
  # Rolling means of the lagged columns (disabled)
  # tk_augment_slidify(
  # .value = contains("_lag"),
  # .period = c(2, 4, 6),
  # .f = ~ mean(.x, na.rm = TRUE),
  # .partial = TRUE,
  # .align = "center"
  # ) %>%
  ungroup()
Rolling window calculation procedure is commented out.
Overview
# Print a compact overview of the final table: one line per column
glimpse(tdengue)
Rows: 425,733
Columns: 105
$ mun <chr> "110002", "110002", "110002", "110002", "110002", "11000…
$ date <date> 2010-12-26, 2011-01-02, 2011-01-09, 2011-01-16, 2011-01…
$ cases <dbl> -0.29323416, -0.05782082, -0.05782082, -0.17552749, -0.0…
$ prec <dbl> -1.84351393, 1.58948545, 1.38739445, 1.56445970, 1.24738…
$ tmax <dbl> -24.73411305, 0.06515107, 0.06350563, 0.10381893, 0.1120…
$ tmin <dbl> -24.67450779, 0.18593524, 0.19523234, 0.20368425, 0.1986…
$ cases_raw <dbl> 0, 2, 2, 1, 2, 5, 0, 1, 2, 1, 0, 2, 1, 2, 2, 0, 0, 0, 1,…
$ cases_cum_raw <dbl> 0, 2, 4, 5, 7, 12, 12, 13, 15, 16, 16, 18, 19, 21, 23, 2…
$ cases_cum <dbl> -0.8506694, -0.8445922, -0.8385149, -0.8354763, -0.82939…
$ cases_lag1 <dbl> NA, -0.29323416, -0.05782082, -0.05782082, -0.17552749, …
$ prec_lag1 <dbl> NA, -1.84351393, 1.58948545, 1.38739445, 1.56445970, 1.2…
$ tmax_lag1 <dbl> NA, -24.73411305, 0.06515107, 0.06350563, 0.10381893, 0.…
$ tmin_lag1 <dbl> NA, -24.67450779, 0.18593524, 0.19523234, 0.20368425, 0.…
$ cases_lag2 <dbl> NA, NA, -0.29323416, -0.05782082, -0.05782082, -0.175527…
$ prec_lag2 <dbl> NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.56445970,…
$ tmax_lag2 <dbl> NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.10381893…
$ tmin_lag2 <dbl> NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.20368425…
$ cases_lag3 <dbl> NA, NA, NA, -0.29323416, -0.05782082, -0.05782082, -0.17…
$ prec_lag3 <dbl> NA, NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.56445…
$ tmax_lag3 <dbl> NA, NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.1038…
$ tmin_lag3 <dbl> NA, NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.2036…
$ cases_lag4 <dbl> NA, NA, NA, NA, -0.29323416, -0.05782082, -0.05782082, -…
$ prec_lag4 <dbl> NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.5…
$ tmax_lag4 <dbl> NA, NA, NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.…
$ tmin_lag4 <dbl> NA, NA, NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.…
$ cases_lag5 <dbl> NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0.0578208…
$ prec_lag5 <dbl> NA, NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739445,…
$ tmax_lag5 <dbl> NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.06350563…
$ tmin_lag5 <dbl> NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.19523234…
$ cases_lag6 <dbl> NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0.057…
$ prec_lag6 <dbl> NA, NA, NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739…
$ tmax_lag6 <dbl> NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.0635…
$ tmin_lag6 <dbl> NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.1952…
$ cases_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0…
$ prec_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.5894854, 1.387…
$ tmax_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.…
$ tmin_lag7 <dbl> NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.…
$ cases_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082…
$ prec_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.5894854, 1…
$ tmax_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107…
$ tmin_lag8 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524…
$ cases_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.0578…
$ prec_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.589485…
$ tmax_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.0651…
$ tmin_lag9 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.1859…
$ cases_lag10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.…
$ prec_lag10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.58…
$ tmax_lag10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.…
$ tmin_lag10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745078, 0.1…
$ cases_lag11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416,…
$ prec_lag11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.843514, 1…
$ tmax_lag11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305…
$ tmin_lag11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745078,…
$ cases_lag12 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323…
$ prec_lag12 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.84351…
$ tmax_lag12 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.7341…
$ tmin_lag12 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745…
$ cases_lag13 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.2…
$ prec_lag13 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8…
$ tmax_lag13 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.…
$ tmin_lag13 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.…
$ cases_lag14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag14 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag15 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag15 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag15 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag15 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag16 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag16 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag16 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag16 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag17 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag17 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag17 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag17 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag18 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag18 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag18 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag18 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag19 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag19 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag19 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag19 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag20 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag20 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag20 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag20 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag21 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag21 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag21 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag21 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag22 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag22 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag22 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag22 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag23 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag23 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag23 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag23 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag24 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag24 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag24 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag24 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Save result
# Write the bundled table to a parquet file at the path given by data_dir()
write_parquet(x = tdengue, sink = data_dir("bundled_data/tdengue.parquet"))
Session info
# Record the R version and package versions used for this run
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] timetk_2.9.0 yardstick_1.3.0 workflowsets_1.0.1 workflows_1.1.3
[5] tune_1.1.2 rsample_1.2.0 recipes_1.0.10 parsnip_1.2.0
[9] modeldata_1.3.0 infer_1.0.6 dials_1.2.0 scales_1.3.0
[13] broom_1.0.5 tidymodels_1.1.1 brpop_0.3.0 arrow_14.0.0.2
[17] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
[21] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
[25] ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] tidyselect_1.2.0 timeDate_4032.109 fastmap_1.1.1
[4] digest_0.6.34 rpart_4.1.23 timechange_0.3.0
[7] lifecycle_1.0.4 survival_3.5-7 magrittr_2.0.3
[10] compiler_4.3.2 rlang_1.1.3 tools_4.3.2
[13] utf8_1.2.4 yaml_2.3.8 data.table_1.15.0
[16] knitr_1.45 htmlwidgets_1.6.4 bit_4.0.5
[19] DiceDesign_1.10 withr_3.0.0 nnet_7.3-19
[22] grid_4.3.2 fansi_1.0.6 xts_0.13.2
[25] colorspace_2.1-0 future_1.33.1 iterators_1.0.14
[28] globals_0.16.2 MASS_7.3-60 anytime_0.3.9
[31] cli_3.6.2 rmarkdown_2.25 generics_0.1.3
[34] rstudioapi_0.15.0 future.apply_1.11.1 tzdb_0.4.0
[37] splines_4.3.2 assertthat_0.2.1 parallel_4.3.2
[40] vctrs_0.6.5 hardhat_1.3.1 Matrix_1.6-3
[43] jsonlite_1.8.8 hms_1.1.3 bit64_4.0.5
[46] listenv_0.9.1 foreach_1.5.2 gower_1.0.1
[49] glue_1.7.0 parallelly_1.37.0 codetools_0.2-19
[52] stringi_1.8.3 gtable_0.3.4 GPfit_1.0-8
[55] munsell_0.5.0 pillar_1.9.0 furrr_0.3.1
[58] htmltools_0.5.7 ipred_0.9-14 lava_1.7.3
[61] R6_2.5.1 lhs_1.1.6 evaluate_0.23
[64] lattice_0.22-5 backports_1.4.1 class_7.3-22
[67] Rcpp_1.0.12 prodlim_2023.08.28 padr_0.6.2
[70] xfun_0.42 zoo_1.8-12 pkgconfig_2.0.3