Bundled data

Author

Raphael Saldanha

Last modification

February 19, 2024 | 10:30:32 +01:00

This notebook prepares a data-set with dengue cases and covariates of interest per municipality.

Packages

library(tidyverse)
library(arrow)
library(brpop)
library(tidymodels)
library(timetk)
source("functions.R")

Data

Dengue

Load dengue cases data and aggregate per week.

# Weekly dengue case counts per municipality: read the aggregated parquet
# file and add up the `freq` column within each municipality and week.
dengue <- data_dir("dengue_data/parquet_aggregated/dengue_md.parquet") %>%
  read_parquet() %>%
  group_by(mun) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    cases = sum(freq, na.rm = TRUE)
  ) %>%
  ungroup()

Population

Load municipality population data for the years present in the dengue case data.

# Years spanned by the weekly dengue series
dengue_years <- seq(year(min(dengue$date)), year(max(dengue$date)))

# Municipality population estimates restricted to those years, with the
# municipality code as character so it can be joined to the dengue table
pop <- mun_pop_totals() %>%
  filter(year %in% dengue_years) %>%
  mutate(mun = as.character(mun))

# The population series may end before the dengue series does. Carry the
# latest available estimate forward to every dengue year that has no
# estimate, instead of hard-coding a single 2021 -> 2022 copy. This avoids
# duplicated rows if the source already covers a year and keeps working
# when the dengue series is extended.
last_pop_year <- max(pop$year)
years_to_fill <- dengue_years[dengue_years > last_pop_year]

pop_filled <- map(
  years_to_fill,
  \(y) pop %>%
    filter(year == last_pop_year) %>%
    mutate(year = as.numeric(y))
) %>%
  list_rbind()

pop <- bind_rows(pop, pop_filled)

rm(dengue_years, last_pop_year, years_to_fill, pop_filled)

Weather variables

Load weather variables and group per week.

Precipitation

# Weekly precipitation: open the ERA5 parquet dataset, keep the
# "total_precipitation_sum_sum" indicator and add up its values per week.
# NOTE(review): only `date` and `value` are kept before aggregating, so any
# spatial identifier present in the file is dropped and each weekly figure
# pools every row sharing that week -- confirm this is intended.
prec <- data_dir("weather_data/parquet/era5/total_precipitation_sum.parquet") %>%
  open_dataset() %>%
  filter(name == "total_precipitation_sum_sum") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue series
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    prec = sum(value, na.rm = TRUE)
  )
Note

As precipitation is an accumulated volume, the values within each week are summed rather than averaged.

Average maximum temperature

# Weekly maximum temperature: open the ERA5 parquet dataset, keep the
# "2m_temperature_max_mean" indicator and average its values per week.
# NOTE(review): only `date` and `value` are kept before aggregating, so any
# spatial identifier present in the file is dropped -- confirm this is
# intended.
tmax <- data_dir("weather_data/parquet/era5/2m_temperature_max.parquet") %>%
  open_dataset() %>%
  filter(name == "2m_temperature_max_mean") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue series
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    tmax = mean(value, na.rm = TRUE)
  )

Average minimum temperature

# Weekly minimum temperature: open the ERA5 parquet dataset, keep the
# "2m_temperature_min_mean" indicator and average its values per week.
# NOTE(review): only `date` and `value` are kept before aggregating, so any
# spatial identifier present in the file is dropped -- confirm this is
# intended.
tmin <- data_dir("weather_data/parquet/era5/2m_temperature_min.parquet") %>%
  open_dataset() %>%
  filter(name == "2m_temperature_min_mean") %>%
  select(date, value) %>%
  collect() %>%
  # Restrict to the period covered by the dengue series
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    tmin = mean(value, na.rm = TRUE)
  )

Join data

Join dengue cases, population and weather variables.

# Attach the yearly population estimate to each weekly record; the year is
# taken from the week's date and only used as a temporary join key.
tdengue <- dengue %>%
  mutate(dengue_year = year(date)) %>%
  inner_join(pop, by = join_by(dengue_year == year, mun)) %>%
  select(-dengue_year)

# Attach the weekly weather tables one after another, matching on date only
tdengue <- reduce(
  list(prec, tmax, tmin),
  \(acc, weather) inner_join(acc, weather, by = "date"),
  .init = tdengue
)

# The source tables are no longer needed
rm(dengue, prec, tmax, tmin)
Note

The population estimate is constant over each year.

Cleaning and basic features

Due to the sparsity of dengue cases, only municipalities with at least 50,000 inhabitants are kept.

# Codes of the municipalities with at least 50k inhabitants in the most
# recent year of the population table
mun_vec <- pop %>%
  filter(year == max(year), pop >= 50000) %>%
  pull(mun)

# The population table is not used after this point
rm(pop)
# Weather was joined by date only, so every row of a given week carries the
# same prec/tmax/tmin. Keep one row per week as a lookup, taken from the
# full table so weeks without cases in the kept municipalities are covered.
weather_by_week <- tdengue %>%
  distinct(date, prec, tmax, tmin)

# Common padding window, taken from the table before filtering
week_span <- range(tdengue$date)

tdengue <- tdengue %>%
  # Remove municipalities with zero population
  filter(pop != 0) %>%
  # Keep only municipalities in the list
  filter(mun %in% mun_vec) %>%
  # Compute dengue rate per population
  # mutate(cases = cases/pop*100000) %>%
  # Keep only the case series. Population is dropped, and weather is set
  # aside so that the zero padding below cannot overwrite it.
  select(mun, date, cases) %>%
  # Pad missing weeks with zero cases. Padding with the weather columns
  # present would set them to 0 too, which creates extreme outliers once
  # the series are standardized.
  group_by(mun) %>%
  pad_by_time(
    date,
    .by = "week",
    .pad_value = 0,
    .start_date = week_span[1],
    .end_date = week_span[2]
  ) %>%
  ungroup() %>%
  # Re-attach the observed weather of each week, including padded weeks.
  # Weeks with no weather record stay NA instead of being dropped.
  left_join(weather_by_week, by = "date", relationship = "many-to-one") %>%
  # Round values
  mutate(across(c(cases, prec, tmax, tmin), ~ round(.x, digits = 2)))

rm(weather_by_week, week_span)

Municipalities remaining in the dataset: 679

Warning

The computation of dengue incidence (cases per population) is commented out, so the raw case counts are kept.

Standardize measures

Center each variable around its mean and scale it to unit standard deviation, separately for each municipality.

\[ x' = \frac{x - \mu}{\sigma} \]

# Keep the raw weekly counts and their running total per municipality, then
# standardize the case, cumulative-case and weather series within each
# municipality.
tdengue <- tdengue %>%
  mutate(cases_raw = cases) %>%
  # Sort chronologically so the cumulative sum below runs forward in time
  arrange(date) %>%
  group_by(mun) %>%
  mutate(
    cases_cum_raw = cumsum(cases_raw),
    cases_cum = cases_cum_raw
  ) %>%
  mutate(
    across(
      c(cases, cases_cum, prec, tmax, tmin),
      ~ standardize_vec(.x, silent = TRUE)
    )
  ) %>%
  ungroup()

Lag and rolling lag variables

Creates lagged variables from standardized dengue cases and weather variables, from 1 to 24 weeks (6 months).

# Add lagged copies (1 to 24 weeks back) of the case and weather series,
# computed within each municipality.
tdengue <- tdengue %>%
  group_by(mun) %>%
  tk_augment_lags(
    .value = c(cases, prec, tmax, tmin),
    .lags = seq_len(24)
  ) %>%
  # Rolling means over the lagged columns (currently disabled):
  # tk_augment_slidify(
  #   .value = contains("_lag"), 
  #   .period = c(2, 4, 6), 
  #   .f = ~ mean(.x, na.rm = TRUE), 
  #   .partial = TRUE,
  #   .align   = "center"
  # ) %>%
  ungroup()
Warning

Rolling window calculation procedure is commented out.

Overview

# Print a compact preview of the final table: row and column counts, then
# each column's type and first values
glimpse(tdengue)
Rows: 425,733
Columns: 105
$ mun           <chr> "110002", "110002", "110002", "110002", "110002", "11000…
$ date          <date> 2010-12-26, 2011-01-02, 2011-01-09, 2011-01-16, 2011-01…
$ cases         <dbl> -0.29323416, -0.05782082, -0.05782082, -0.17552749, -0.0…
$ prec          <dbl> -1.84351393, 1.58948545, 1.38739445, 1.56445970, 1.24738…
$ tmax          <dbl> -24.73411305, 0.06515107, 0.06350563, 0.10381893, 0.1120…
$ tmin          <dbl> -24.67450779, 0.18593524, 0.19523234, 0.20368425, 0.1986…
$ cases_raw     <dbl> 0, 2, 2, 1, 2, 5, 0, 1, 2, 1, 0, 2, 1, 2, 2, 0, 0, 0, 1,…
$ cases_cum_raw <dbl> 0, 2, 4, 5, 7, 12, 12, 13, 15, 16, 16, 18, 19, 21, 23, 2…
$ cases_cum     <dbl> -0.8506694, -0.8445922, -0.8385149, -0.8354763, -0.82939…
$ cases_lag1    <dbl> NA, -0.29323416, -0.05782082, -0.05782082, -0.17552749, …
$ prec_lag1     <dbl> NA, -1.84351393, 1.58948545, 1.38739445, 1.56445970, 1.2…
$ tmax_lag1     <dbl> NA, -24.73411305, 0.06515107, 0.06350563, 0.10381893, 0.…
$ tmin_lag1     <dbl> NA, -24.67450779, 0.18593524, 0.19523234, 0.20368425, 0.…
$ cases_lag2    <dbl> NA, NA, -0.29323416, -0.05782082, -0.05782082, -0.175527…
$ prec_lag2     <dbl> NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.56445970,…
$ tmax_lag2     <dbl> NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.10381893…
$ tmin_lag2     <dbl> NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.20368425…
$ cases_lag3    <dbl> NA, NA, NA, -0.29323416, -0.05782082, -0.05782082, -0.17…
$ prec_lag3     <dbl> NA, NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.56445…
$ tmax_lag3     <dbl> NA, NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.1038…
$ tmin_lag3     <dbl> NA, NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.2036…
$ cases_lag4    <dbl> NA, NA, NA, NA, -0.29323416, -0.05782082, -0.05782082, -…
$ prec_lag4     <dbl> NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739445, 1.5…
$ tmax_lag4     <dbl> NA, NA, NA, NA, -24.73411305, 0.06515107, 0.06350563, 0.…
$ tmin_lag4     <dbl> NA, NA, NA, NA, -24.67450779, 0.18593524, 0.19523234, 0.…
$ cases_lag5    <dbl> NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0.0578208…
$ prec_lag5     <dbl> NA, NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739445,…
$ tmax_lag5     <dbl> NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.06350563…
$ tmin_lag5     <dbl> NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.19523234…
$ cases_lag6    <dbl> NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0.057…
$ prec_lag6     <dbl> NA, NA, NA, NA, NA, NA, -1.84351393, 1.58948545, 1.38739…
$ tmax_lag6     <dbl> NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.0635…
$ tmin_lag6     <dbl> NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.1952…
$ cases_lag7    <dbl> NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082, -0…
$ prec_lag7     <dbl> NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.5894854, 1.387…
$ tmax_lag7     <dbl> NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107, 0.…
$ tmin_lag7     <dbl> NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524, 0.…
$ cases_lag8    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.05782082…
$ prec_lag8     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.5894854, 1…
$ tmax_lag8     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.06515107…
$ tmin_lag8     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.18593524…
$ cases_lag9    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.0578…
$ prec_lag9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.589485…
$ tmax_lag9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.0651…
$ tmin_lag9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.67450779, 0.1859…
$ cases_lag10   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416, -0.…
$ prec_lag10    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8435139, 1.58…
$ tmax_lag10    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305, 0.…
$ tmin_lag10    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745078, 0.1…
$ cases_lag11   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323416,…
$ prec_lag11    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.843514, 1…
$ tmax_lag11    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.73411305…
$ tmin_lag11    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745078,…
$ cases_lag12   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.29323…
$ prec_lag12    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.84351…
$ tmax_lag12    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.7341…
$ tmin_lag12    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.6745…
$ cases_lag13   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.2…
$ prec_lag13    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -1.8…
$ tmax_lag13    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.…
$ tmin_lag13    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -24.…
$ cases_lag14   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag14    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag14    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag14    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag15   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag15    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag15    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag15    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag16   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag16    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag16    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag16    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag17   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag17    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag17    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag17    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag18   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag18    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag18    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag18    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag19   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag19    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag19    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag19    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag20   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag20    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag20    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag20    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag21   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag21    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag21    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag21    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag22   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag22    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag22    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag22    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag23   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag23    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag23    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag23    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ cases_lag24   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ prec_lag24    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmax_lag24    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ tmin_lag24    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Save result

# Write the final table to a Parquet file. data_dir() is loaded from
# functions.R; presumably it resolves the path inside the project's data
# directory -- confirm there.
write_parquet(x = tdengue, sink = data_dir("bundled_data/tdengue.parquet"))

Session info

# Record the R version, platform and package versions used for this run
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] timetk_2.9.0       yardstick_1.3.0    workflowsets_1.0.1 workflows_1.1.3   
 [5] tune_1.1.2         rsample_1.2.0      recipes_1.0.10     parsnip_1.2.0     
 [9] modeldata_1.3.0    infer_1.0.6        dials_1.2.0        scales_1.3.0      
[13] broom_1.0.5        tidymodels_1.1.1   brpop_0.3.0        arrow_14.0.0.2    
[17] lubridate_1.9.3    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
[21] purrr_1.0.2        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1      
[25] ggplot2_3.4.4      tidyverse_2.0.0   

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0    timeDate_4032.109   fastmap_1.1.1      
 [4] digest_0.6.34       rpart_4.1.23        timechange_0.3.0   
 [7] lifecycle_1.0.4     survival_3.5-7      magrittr_2.0.3     
[10] compiler_4.3.2      rlang_1.1.3         tools_4.3.2        
[13] utf8_1.2.4          yaml_2.3.8          data.table_1.15.0  
[16] knitr_1.45          htmlwidgets_1.6.4   bit_4.0.5          
[19] DiceDesign_1.10     withr_3.0.0         nnet_7.3-19        
[22] grid_4.3.2          fansi_1.0.6         xts_0.13.2         
[25] colorspace_2.1-0    future_1.33.1       iterators_1.0.14   
[28] globals_0.16.2      MASS_7.3-60         anytime_0.3.9      
[31] cli_3.6.2           rmarkdown_2.25      generics_0.1.3     
[34] rstudioapi_0.15.0   future.apply_1.11.1 tzdb_0.4.0         
[37] splines_4.3.2       assertthat_0.2.1    parallel_4.3.2     
[40] vctrs_0.6.5         hardhat_1.3.1       Matrix_1.6-3       
[43] jsonlite_1.8.8      hms_1.1.3           bit64_4.0.5        
[46] listenv_0.9.1       foreach_1.5.2       gower_1.0.1        
[49] glue_1.7.0          parallelly_1.37.0   codetools_0.2-19   
[52] stringi_1.8.3       gtable_0.3.4        GPfit_1.0-8        
[55] munsell_0.5.0       pillar_1.9.0        furrr_0.3.1        
[58] htmltools_0.5.7     ipred_0.9-14        lava_1.7.3         
[61] R6_2.5.1            lhs_1.1.6           evaluate_0.23      
[64] lattice_0.22-5      backports_1.4.1     class_7.3-22       
[67] Rcpp_1.0.12         prodlim_2023.08.28  padr_0.6.2         
[70] xfun_0.42           zoo_1.8-12          pkgconfig_2.0.3    
Back to top