Export aggregated database

Author

Raphael Saldanha

Last modification

February 19, 2024 | 09:43:37 +01:00

Context

This notebook creates a database of positive dengue case counts, aggregated by spatial and temporal units.

Cases are aggregated by municipality, using two time units: calendar date and epidemiological week.

Packages

library(tidyverse)
library(arrow)
library(lubridate)
source("../functions.R")

Execution node

# Print the name of the node running this notebook.
# NOTE(review): node_name() is presumably defined in ../functions.R — confirm.
node_name()
[1] "rfsaldanha"

Load data

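The Parquet file is read with `arrow::read_parquet()`, which by default loads the data into memory as a data frame (for lazy evaluation, `arrow::open_dataset()` would be needed instead).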

# Read the improved dengue Parquet file. data_dir() builds the full path
# (NOTE(review): helper presumably defined in ../functions.R — confirm).
dengue <- data_dir("dengue_data/parquet_improved/dengue_improved.parquet") %>%
  arrow::read_parquet()

# Row count before any filtering.
dengue %>% tally()
# A tibble: 1 × 1
         n
     <int>
1 16956001

Filter valid dates

# Accepted period for the case date: 2011-01-01 to 2022-12-31.
valid_interval <- interval(
  start = ymd("2011-01-01"),
  end = ymd("2022-12-31")
)

# Parse DT_SIN_PRI as a date, then keep only rows inside the accepted period.
dengue <- mutate(dengue, DT_SIN_PRI = ymd(DT_SIN_PRI))
dengue <- filter(dengue, DT_SIN_PRI %within% valid_interval)

# Row count after the date filter.
dengue %>% tally()
# A tibble: 1 × 1
         n
     <int>
1 16956001

Filter positive cases of dengue

# Values of CLASSI_FIN that are treated as a positive dengue case.
dengue_classifications <- c(
  "Febre hemorrágica do dengue",
  "Síndrome do choque do dengue",
  "Dengue com sinais de alarme",
  "Dengue clássico",
  "Dengue com complicações",
  "Dengue",
  "Dengue grave"
)

# Keep only the rows whose final classification is in the list above.
dengue <- filter(dengue, CLASSI_FIN %in% dengue_classifications)

# Row count after keeping positive cases only.
dengue %>% tally()
# A tibble: 1 × 1
        n
    <int>
1 9369027

Aggregation

Municipality and date (“md”)

This includes a step that fills each municipality's time series with zeros for dates without cases, between that municipality's first and last case dates.

# Daily case counts per municipality ("md").
#
# Steps:
#   1. Count rows per municipality (ID_MN_RESI) and case date (DT_SIN_PRI).
#   2. For each municipality, add the missing days between its own first and
#      last case dates, with a count of zero.
#
# Result: a data frame with columns `mun`, `date` and `freq`.
dengue_md <- dengue %>%
  group_by(mun = ID_MN_RESI, date = DT_SIN_PRI) %>%
  # Keep the result grouped by `mun` on purpose: complete() below must run
  # once per municipality so that min(date)/max(date) are per-municipality.
  # Stating `.groups` explicitly also silences the summarise() message.
  summarise(freq = n(), .groups = "drop_last") %>%
  arrange(date) %>%
  complete(
    date = seq.Date(min(date), max(date), by = "day"),
    # Integer zero, so the fill value has the same type as the n() counts.
    fill = list(freq = 0L)
  ) %>%
  ungroup() %>%
  collect()
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
# Quick look at the structure of the daily table.
dengue_md %>% glimpse()
Rows: 19,456,365
Columns: 3
$ mun  <chr> "110000", "110000", "110000", "110000", "110000", "110000", "1100…
$ date <date> 2015-12-29, 2015-12-30, 2015-12-31, 2016-01-01, 2016-01-02, 2016…
$ freq <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Save the municipality-by-date table as a Parquet file.
write_parquet(
  x = dengue_md,
  sink = data_dir("dengue_data/parquet_aggregated/dengue_md.parquet")
)

Municipality and epidemiological week (“mw”)

# Weekly case counts per municipality ("mw"), derived from the daily table.
#
# `epi_week` is a "YYYY-WW" label built from lubridate's epiyear() and
# epiweek(), with the week number zero-padded to two digits.
#
# Result: a data frame with columns `mun`, `epi_week` and `freq`, ordered by
# `epi_week`.
dengue_mw <- dengue_md %>%
  mutate(
    epi_week = paste0(epiyear(date), "-", str_pad(epiweek(date), 2, pad = "0"))
  ) %>%
  group_by(mun, epi_week) %>%
  # dengue_md already holds one row per municipality and day, with the daily
  # count in `freq` (zero-filled). The weekly total is therefore the SUM of
  # those counts; n() would only count how many day-rows fall in the week.
  summarise(freq = sum(freq), .groups = "drop") %>%
  arrange(epi_week)
`summarise()` has grouped output by 'mun'. You can override using the `.groups`
argument.
# Quick look at the structure of the weekly table.
dengue_mw %>% glimpse()
Rows: 2,784,191
Columns: 3
$ mun      <chr> "110005", "110006", "110014", "110034", "110140", "120017", "…
$ epi_week <chr> "2010-52", "2010-52", "2010-52", "2010-52", "2010-52", "2010-…
$ freq     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Save the municipality-by-epidemiological-week table as a Parquet file.
write_parquet(
  x = dengue_mw,
  sink = data_dir("dengue_data/parquet_aggregated/dengue_mw.parquet")
)

Session info

# Print R version, platform, locale and package versions for reproducibility.
utils::sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] arrow_14.0.0.2  lubridate_1.9.3 forcats_1.0.0   stringr_1.5.1  
 [5] dplyr_1.1.4     purrr_1.0.2     readr_2.1.5     tidyr_1.3.1    
 [9] tibble_3.2.1    ggplot2_3.4.4   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] bit_4.0.5         gtable_0.3.4      jsonlite_1.8.8    compiler_4.3.2   
 [5] tidyselect_1.2.0  assertthat_0.2.1  scales_1.3.0      yaml_2.3.8       
 [9] fastmap_1.1.1     R6_2.5.1          generics_0.1.3    knitr_1.45       
[13] htmlwidgets_1.6.4 munsell_0.5.0     pillar_1.9.0      tzdb_0.4.0       
[17] rlang_1.1.3       utf8_1.2.4        stringi_1.8.3     xfun_0.42        
[21] bit64_4.0.5       timechange_0.3.0  cli_3.6.2         withr_3.0.0      
[25] magrittr_2.0.3    digest_0.6.34     grid_4.3.2        hms_1.1.3        
[29] lifecycle_1.0.4   vctrs_0.6.5       evaluate_0.23     glue_1.7.0       
[33] fansi_1.0.6       colorspace_2.1-0  rmarkdown_2.25    tools_4.3.2      
[37] pkgconfig_2.0.3   htmltools_0.5.7  
Back to top