library(tidyverse)
library(arrow)
library(lubridate)
source("../functions.R")Export aggregated database
Context
This notebook creates a database of aggregated counts of positive dengue cases by spatial and temporal units.
The spatial unit is the municipality; the temporal units are the date and the epidemiological week.
Packages
Execution node
# Print the name of the execution node.
# NOTE(review): node_name() is presumably defined in ../functions.R — confirm.
node_name()
[1] "rfsaldanha"
Load data
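The file is read with `arrow::read_parquet()`, which by default returns a tibble rather than a lazily evaluated Arrow query.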
# Read the dengue parquet file located through data_dir(), then count its rows.
dengue <- data_dir("dengue_data/parquet_improved/dengue_improved.parquet") %>%
  arrow::read_parquet()
dengue %>%
  tally()
# A tibble: 1 × 1
n
<int>
1 16956001
Filter valid dates
# Accepted range for DT_SIN_PRI: 2011-01-01 through 2022-12-31.
valid_interval <- interval(
  start = ymd("2011-01-01"),
  end = ymd("2022-12-31")
)
# Parse DT_SIN_PRI as a date and keep only the records inside the interval.
dengue <- filter(
  mutate(dengue, DT_SIN_PRI = ymd(DT_SIN_PRI)),
  DT_SIN_PRI %within% valid_interval
)
dengue %>%
  tally()
# A tibble: 1 × 1
n
<int>
1 16956001
Filter positive cases of dengue
# CLASSI_FIN values that are treated as positive dengue cases.
dengue_classifications <- c(
  "Febre hemorrágica do dengue",
  "Síndrome do choque do dengue",
  "Dengue com sinais de alarme",
  "Dengue clássico",
  "Dengue com complicações",
  "Dengue",
  "Dengue grave"
)
# Retain only the records carrying one of those classifications.
dengue <- filter(dengue, CLASSI_FIN %in% dengue_classifications)
dengue %>%
  tally()
# A tibble: 1 × 1
n
<int>
1 9369027
Aggregation
Municipality and date (“md”)
This step also fills each municipality's time series with zero counts for the dates without cases, between that municipality's first and last recorded dates.
# Daily case counts per municipality ("md").
# Records are counted per (mun, date). summarise() leaves the result grouped
# by `mun`, so complete() fills the missing days of each municipality
# separately, from its own earliest to its own latest date, with freq = 0.
dengue_md <- dengue %>%
  group_by(mun = ID_MN_RESI, date = DT_SIN_PRI) %>%
  summarise(freq = n()) %>%
  arrange(date) %>%
  complete(
    date = seq(from = min(date), to = max(date), by = "day"),
    fill = list(freq = 0)
  ) %>%
  ungroup() %>%
  collect()
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
# Show the structure of the daily table.
dengue_md %>%
  glimpse()
Rows: 19,456,365
Columns: 3
$ mun <chr> "110000", "110000", "110000", "110000", "110000", "110000", "1100…
$ date <date> 2015-12-29, 2015-12-30, 2015-12-31, 2016-01-01, 2016-01-02, 2016…
$ freq <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Save the municipality/date table as a parquet file.
write_parquet(
  x = dengue_md,
  sink = data_dir("dengue_data/parquet_aggregated/dengue_md.parquet")
)
Municipality and epidemiological week (“mw”)
# Weekly case counts per municipality ("mw"), derived from the daily table.
# epi_week is "<epidemiological year>-<zero-padded epidemiological week>".
# The weekly count must be the SUM of the daily counts: dengue_md holds one
# row per municipality and day (including the zero-filled days), so n() would
# return the number of days present in the week, not the number of cases.
# NOTE: the glimpse() output recorded below predates this fix.
dengue_mw <- dengue_md %>%
  mutate(
    epi_week = paste0(epiyear(date), "-", str_pad(epiweek(date), 2, pad = "0"))
  ) %>%
  group_by(mun, epi_week) %>%
  summarise(freq = sum(freq)) %>%
  arrange(epi_week) %>%
  ungroup() %>%
  collect()
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
# Show the structure of the weekly table.
dengue_mw %>%
  glimpse()
Rows: 2,784,191
Columns: 3
$ mun <chr> "110005", "110006", "110014", "110034", "110140", "120017", "…
$ epi_week <chr> "2010-52", "2010-52", "2010-52", "2010-52", "2010-52", "2010-…
$ freq <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Save the municipality/epidemiological-week table as a parquet file.
write_parquet(
  x = dengue_mw,
  sink = data_dir("dengue_data/parquet_aggregated/dengue_mw.parquet")
)
Session info
# Print the R session details (R version, platform, attached packages).
utils::sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] arrow_14.0.0.2 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] bit_4.0.5 gtable_0.3.4 jsonlite_1.8.8 compiler_4.3.2
[5] tidyselect_1.2.0 assertthat_0.2.1 scales_1.3.0 yaml_2.3.8
[9] fastmap_1.1.1 R6_2.5.1 generics_0.1.3 knitr_1.45
[13] htmlwidgets_1.6.4 munsell_0.5.0 pillar_1.9.0 tzdb_0.4.0
[17] rlang_1.1.3 utf8_1.2.4 stringi_1.8.3 xfun_0.42
[21] bit64_4.0.5 timechange_0.3.0 cli_3.6.2 withr_3.0.0
[25] magrittr_2.0.3 digest_0.6.34 grid_4.3.2 hms_1.1.3
[29] lifecycle_1.0.4 vctrs_0.6.5 evaluate_0.23 glue_1.7.0
[33] fansi_1.0.6 colorspace_2.1-0 rmarkdown_2.25 tools_4.3.2
[37] pkgconfig_2.0.3 htmltools_0.5.7