library(tidyverse)
library(arrow)
library(knitr)
library(lubridate)
source("../functions.R")
Imputation
This notebook aims to impute some missing data and enrich the dataset.
Packages
Execution node
# Print an identifier for the machine that executed this notebook.
# NOTE(review): node_name() is not defined in this file; presumably it is
# provided by ../functions.R — confirm.
node_name()
[1] "rfsaldanha"
Load data
# Names of the columns kept from the raw dengue files; the loading step
# selects exactly these columns and drops everything else.
important_vars <- c(
  "ID_AGRAVO", "DT_NOTIFIC", "ID_UNIDADE",
  "DT_SIN_PRI", "CS_SEXO", "CS_GESTANT",
  "CS_RACA", "CS_ESCOL_N", "ID_MN_RESI",
  "COUFINF", "COMUNINF", "ID_OCUPA_N",
  "DT_SORO", "RESUL_SORO", "SOROTIPO",
  "CLASSI_FIN", "CRITERIO", "EVOLUCAO",
  "DT_OBITO", "HOSPITALIZ", "DT_INTERNA"
)
# Paths of the yearly dengue parquet files, one file per year from 2011 to
# 2022. Each relative path is resolved by its own data_dir() call, in
# ascending year order, so the result matches a hand-written list of the
# twelve files.
# NOTE(review): assumes data_dir() returns a single path string per call —
# confirm against ../functions.R.
dengue_files_list <- vapply(
  2011:2022,
  function(year) {
    data_dir(sprintf("dengue_data/parquets/dengue_%d.parquet", year))
  },
  FUN.VALUE = character(1)
)
# Open all yearly parquet files as one Arrow dataset, keep only the columns
# named in important_vars, and pull the result into memory with collect().
dengue <- open_dataset(sources = dengue_files_list) %>%
  select(all_of(important_vars)) %>%
  collect()
Residence municipality: ID_MN_RESI
Task: If ID_MN_RESI is invalid or missing, impute it with valid COMUNINF information.
Imputation
# Impute the residence municipality (ID_MN_RESI) from COMUNINF.
#
# A code is treated as valid when it has exactly 6 characters. The check
# columns are NA when the underlying value is missing.
#
# ID_MN_RESI is replaced only when it is missing or invalid AND COMUNINF is
# itself valid; otherwise the original value (possibly NA) is kept. Requiring
# a valid COMUNINF in the missing case too prevents a missing value from
# being overwritten with an invalid code.
dengue <- dengue %>%
  mutate(
    ID_MN_RESI_check = nchar(ID_MN_RESI) == 6,
    COMUNINF_check = nchar(COMUNINF) == 6
  ) %>%
  mutate(ID_MN_RESI = case_when(
    # is.na() is tested first so that a missing ID_MN_RESI yields TRUE here
    # instead of the NA that the check column alone would produce.
    (is.na(ID_MN_RESI) | !ID_MN_RESI_check) & COMUNINF_check ~ COMUNINF,
    TRUE ~ ID_MN_RESI
  )) %>%
  # The helper columns are only needed for the imputation itself.
  select(-ID_MN_RESI_check, -COMUNINF_check)
Check improvement
# Tabulate how many records have a 6-character ID_MN_RESI (TRUE), a code of
# another length (FALSE), or a missing value (NA), and print the counts as a
# table with "." as thousands separator and "," as decimal mark.
dengue %>%
  mutate(ID_MN_RESI_check = nchar(ID_MN_RESI) == 6) %>%
  count(ID_MN_RESI_check, name = "freq") %>%
  kable(
    format.args = list(big.mark = ".", decimal.mark = ",")
  )
ID_MN_RESI_check | freq |
---|---|
FALSE | 4 |
TRUE | 16.953.774 |
NA | 2.223 |
No improvement for invalid municipalities; 96 records with missing data were imputed.
Date of the first symptoms onset: DT_SIN_PRI
Task: If DT_SIN_PRI is invalid or missing, impute it with valid DT_NOTIFIC information.
Imputation
# Impute the date of first symptoms (DT_SIN_PRI) from the notification date
# (DT_NOTIFIC).
#
# A date is treated as valid when it falls within valid_interval
# (2011-01-01 to 2022-12-31). The check columns are NA when the date is
# missing or cannot be parsed by ymd().
valid_interval <- interval(ymd("2011-01-01"), ymd("2022-12-31"))

# DT_SIN_PRI is replaced only when it is missing or outside the interval AND
# DT_NOTIFIC is itself inside the interval; otherwise the original value is
# kept. Requiring a valid DT_NOTIFIC in the missing case too prevents a
# missing date from being overwritten with an out-of-range one.
dengue <- dengue %>%
  mutate(
    DT_SIN_PRI_check = ymd(DT_SIN_PRI) %within% valid_interval,
    DT_NOTIFIC_check = ymd(DT_NOTIFIC) %within% valid_interval
  ) %>%
  mutate(DT_SIN_PRI = case_when(
    # is.na() is tested first so that a missing DT_SIN_PRI yields TRUE here
    # instead of the NA that the check column alone would produce.
    (is.na(DT_SIN_PRI) | !DT_SIN_PRI_check) & DT_NOTIFIC_check ~ DT_NOTIFIC,
    TRUE ~ DT_SIN_PRI
  )) %>%
  # The helper columns are only needed for the imputation itself.
  select(-DT_SIN_PRI_check, -DT_NOTIFIC_check)
Check improvement
# Accepted period for symptom-onset dates, declared again here so this check
# can be run on its own.
valid_interval <- interval(ymd("2011-01-01"), ymd("2022-12-31"))

# Tabulate how many records have DT_SIN_PRI inside the accepted period
# (TRUE), outside it (FALSE), or missing/unparseable (NA), and print the
# counts with "." as thousands separator and "," as decimal mark.
dengue %>%
  mutate(DT_SIN_PRI_check = ymd(DT_SIN_PRI) %within% valid_interval) %>%
  count(DT_SIN_PRI_check, name = "freq") %>%
  kable(
    format.args = list(big.mark = ".", decimal.mark = ",")
  )
DT_SIN_PRI_check | freq |
---|---|
TRUE | 16.956.001 |
There was an improvement in 25,823 records with invalid dates, and all missing records were imputed.
Export improved database
# Write the imputed dataset to a single parquet file at the location
# resolved by data_dir().
dengue %>%
  write_parquet(
    sink = data_dir("dengue_data/parquet_improved/dengue_improved.parquet")
  )
Session info
# Print the R version, platform, locale, and attached/loaded packages used
# for this run, for reproducibility.
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.45 arrow_14.0.0.2 lubridate_1.9.3 forcats_1.0.0
[5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
[9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] bit_4.0.5 gtable_0.3.4 jsonlite_1.8.8 compiler_4.3.2
[5] tidyselect_1.2.0 assertthat_0.2.1 scales_1.3.0 yaml_2.3.8
[9] fastmap_1.1.1 R6_2.5.1 generics_0.1.3 htmlwidgets_1.6.4
[13] munsell_0.5.0 pillar_1.9.0 tzdb_0.4.0 rlang_1.1.3
[17] utf8_1.2.4 stringi_1.8.3 xfun_0.42 bit64_4.0.5
[21] timechange_0.3.0 cli_3.6.2 withr_3.0.0 magrittr_2.0.3
[25] digest_0.6.34 grid_4.3.2 hms_1.1.3 lifecycle_1.0.4
[29] vctrs_0.6.5 evaluate_0.23 glue_1.7.0 fansi_1.0.6
[33] colorspace_2.1-0 rmarkdown_2.25 tools_4.3.2 pkgconfig_2.0.3
[37] htmltools_0.5.7