library(tidyverse)
library(arrow)
library(lubridate)
source("../functions.R")
Export aggregated database
Context
This notebook creates a database of aggregated counts of positive dengue cases by spatial and temporal unit.
The spatial unit is the municipality; the temporal units are the date and the epidemiological week.
Packages
Execution node
node_name()
[1] "rfsaldanha"
Load data
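Read the parquet file. Note that `arrow::read_parquet()` returns an in-memory tibble by default, so the steps below are not lazily evaluated and the later `collect()` calls have no effect.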
# Load the improved dengue records from the parquet file.
# `data_dir()` is not defined in this notebook; presumably it comes from
# ../functions.R and resolves a path inside the project data directory.
dengue <- arrow::read_parquet(
  file = data_dir("dengue_data/parquet_improved/dengue_improved.parquet")
)

# Number of records loaded.
tally(dengue)
# A tibble: 1 × 1
n
<int>
1 16956001
Filter valid dates
# Closed interval of accepted values for DT_SIN_PRI.
valid_interval <- interval(ymd("2011-01-01"), ymd("2022-12-31"))

# Parse DT_SIN_PRI as a date and keep only records inside the interval.
dengue <- dengue %>%
  mutate(DT_SIN_PRI = ymd(DT_SIN_PRI)) %>%
  filter(DT_SIN_PRI %within% valid_interval)

# Number of records remaining after the date filter.
tally(dengue)
# A tibble: 1 × 1
n
<int>
1 16956001
Filter positive cases of dengue
# Values of CLASSI_FIN that are treated as a positive dengue case.
dengue_classifications <- c(
  "Febre hemorrágica do dengue", "Síndrome do choque do dengue",
  "Dengue com sinais de alarme", "Dengue clássico",
  "Dengue com complicações", "Dengue", "Dengue grave"
)

# Keep only records whose final classification is one of the values above.
dengue <- dengue %>%
  filter(CLASSI_FIN %in% dengue_classifications)

# Number of positive cases.
tally(dengue)
# A tibble: 1 × 1
n
<int>
1 9369027
Aggregation
Municipality and date (“md”)
This step also fills each municipality's time series with zero counts for the dates, between its first and last recorded case, on which no cases were recorded.
# Daily case counts per municipality ("md").
dengue_md <- dengue %>%
  # One row per municipality and date, with the number of cases.
  group_by(mun = ID_MN_RESI, date = DT_SIN_PRI) %>%
  # summarise() drops the last grouping level, so the result stays grouped
  # by `mun`; the complete() call below relies on that grouping.
  summarise(freq = n()) %>%
  arrange(date) %>%
  # Within each municipality, add every missing day between its first and
  # last observed date, with a count of zero.
  complete(
    date = seq.Date(min(date), max(date), by = "day"),
    fill = list(freq = 0L)
  ) %>%
  ungroup() %>%
  collect()
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
glimpse(dengue_md)
Rows: 19,456,365
Columns: 3
$ mun <chr> "110000", "110000", "110000", "110000", "110000", "110000", "1100…
$ date <date> 2015-12-29, 2015-12-30, 2015-12-31, 2016-01-01, 2016-01-02, 2016…
$ freq <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Persist the municipality/date aggregation as a parquet file.
dengue_md %>%
  write_parquet(
    sink = data_dir("dengue_data/parquet_aggregated/dengue_md.parquet")
  )
Municipality and epidemiological week (“mw”)
# Weekly case counts per municipality ("mw"), derived from the daily table.
dengue_mw <- dengue_md %>%
  # Label each day with its epidemiological year and zero-padded week,
  # e.g. "2011-03", so that the labels sort chronologically as text.
  mutate(
    epi_week = paste0(
      epiyear(date), "-", str_pad(epiweek(date), 2, pad = "0")
    )
  ) %>%
  group_by(mun, epi_week) %>%
  # Sum the daily counts. Using n() here would count the rows of the
  # zero-filled daily table (days per week) instead of the cases.
  summarise(freq = sum(freq)) %>%
  arrange(epi_week) %>%
  ungroup() %>%
  collect()
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
glimpse(dengue_mw)
Rows: 2,784,191
Columns: 3
$ mun <chr> "110005", "110006", "110014", "110034", "110140", "120017", "…
$ epi_week <chr> "2010-52", "2010-52", "2010-52", "2010-52", "2010-52", "2010-…
$ freq <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Persist the municipality/epidemiological-week aggregation as a parquet file.
dengue_mw %>%
  write_parquet(
    sink = data_dir("dengue_data/parquet_aggregated/dengue_mw.parquet")
  )
Session info
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] arrow_14.0.0.2 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] bit_4.0.5 gtable_0.3.4 jsonlite_1.8.8 compiler_4.3.2
[5] tidyselect_1.2.0 assertthat_0.2.1 scales_1.3.0 yaml_2.3.8
[9] fastmap_1.1.1 R6_2.5.1 generics_0.1.3 knitr_1.45
[13] htmlwidgets_1.6.4 munsell_0.5.0 pillar_1.9.0 tzdb_0.4.0
[17] rlang_1.1.3 utf8_1.2.4 stringi_1.8.3 xfun_0.42
[21] bit64_4.0.5 timechange_0.3.0 cli_3.6.2 withr_3.0.0
[25] magrittr_2.0.3 digest_0.6.34 grid_4.3.2 hms_1.1.3
[29] lifecycle_1.0.4 vctrs_0.6.5 evaluate_0.23 glue_1.7.0
[33] fansi_1.0.6 colorspace_2.1-0 rmarkdown_2.25 tools_4.3.2
[37] pkgconfig_2.0.3 htmltools_0.5.7