Imputation

Author

Raphael Saldanha

Last modification

February 19, 2024 | 10:03:25 +01:00

This notebook aims to impute some missing data and enrich the dataset.

Packages

library(tidyverse)
library(arrow)
library(knitr)
library(lubridate)
source("../functions.R")

Execution node

# Print the execution node for traceability. node_name() is not a base R
# function; presumably it is defined in ../functions.R (sourced above) and
# returns the name of the machine running this notebook - TODO confirm.
node_name()
[1] "rfsaldanha"

Load data

# Names of the columns kept when loading the yearly dengue files. The order
# below is the column order of the loaded table.
important_vars <- c(
  "ID_AGRAVO",
  "DT_NOTIFIC",
  "ID_UNIDADE",
  "DT_SIN_PRI",
  "CS_SEXO",
  "CS_GESTANT",
  "CS_RACA",
  "CS_ESCOL_N",
  "ID_MN_RESI",
  "COUFINF",
  "COMUNINF",
  "ID_OCUPA_N",
  "DT_SORO",
  "RESUL_SORO",
  "SOROTIPO",
  "CLASSI_FIN",
  "CRITERIO",
  "EVOLUCAO",
  "DT_OBITO",
  "HOSPITALIZ",
  "DT_INTERNA"
)

# Paths of the yearly dengue parquet files, 2011 to 2022. Each path is
# resolved through data_dir() one at a time and the results are combined
# with c(), exactly as if the twelve calls were written out by hand.
dengue_files_list <- do.call(
  c,
  lapply(2011:2022, function(year) {
    data_dir(paste0("dengue_data/parquets/dengue_", year, ".parquet"))
  })
)

# Open the yearly files as a single Arrow dataset, keep only the columns
# listed in important_vars, and materialise the result in memory.
dengue <- dengue_files_list %>%
  open_dataset() %>%
  select(all_of(important_vars)) %>%
  collect()

Residence municipality: ID_MN_RESI

Task: If ID_MN_RESI is invalid or missing, impute it with a valid COMUNINF value.

Imputation

# Impute the residence municipality (ID_MN_RESI) from COMUNINF.
#
# A code counts as valid when it has exactly six characters. For a missing
# value in a character column nchar() returns NA, so the check is NA and the
# code is not considered valid.
#
# ID_MN_RESI is replaced only when it is invalid or missing AND COMUNINF is
# itself valid. A missing ID_MN_RESI is never overwritten with a malformed
# COMUNINF; in that case it stays NA.
dengue <- dengue %>%
  mutate(
    ID_MN_RESI_check = nchar(ID_MN_RESI) == 6,
    COMUNINF_check = nchar(COMUNINF) == 6
  ) %>%
  mutate(ID_MN_RESI = case_when(
    # `%in% TRUE` turns an NA check into FALSE, so invalid and missing
    # values follow the same rule.
    !(ID_MN_RESI_check %in% TRUE) & COMUNINF_check %in% TRUE ~ COMUNINF,
    TRUE ~ ID_MN_RESI
  )) %>%
  select(-ID_MN_RESI_check, -COMUNINF_check)

Check improvement

# Frequency table of ID_MN_RESI validity (six-character code) after the
# imputation; NA rows are records whose code is still missing.
dengue %>%
  mutate(ID_MN_RESI_check = nchar(ID_MN_RESI) == 6) %>%
  count(ID_MN_RESI_check, name = "freq") %>%
  kable(format.args = list(big.mark = ".", decimal.mark = ","))
ID_MN_RESI_check freq
FALSE 4
TRUE 16.953.774
NA 2.223

No improvement for invalid municipalities, but 96 records with missing data were imputed.

Date of the first symptoms onset: DT_SIN_PRI

Task: If DT_SIN_PRI is invalid or missing, impute it with a valid DT_NOTIFIC value.

Imputation

# Impute the date of first symptoms (DT_SIN_PRI) from the notification date
# (DT_NOTIFIC).
#
# A date counts as valid when it parses with ymd() and falls inside the
# study period below. A missing or unparseable date gives an NA check and is
# not considered valid.
valid_interval <- interval(ymd("2011-01-01"), ymd("2022-12-31"))

# DT_SIN_PRI is replaced only when it is invalid, unparseable or missing AND
# DT_NOTIFIC is itself valid. A missing DT_SIN_PRI is never overwritten with
# an out-of-range DT_NOTIFIC; in that case it stays as it was.
dengue <- dengue %>%
  mutate(
    DT_SIN_PRI_check = ymd(DT_SIN_PRI) %within% valid_interval,
    DT_NOTIFIC_check = ymd(DT_NOTIFIC) %within% valid_interval
  ) %>%
  mutate(DT_SIN_PRI = case_when(
    # `%in% TRUE` turns an NA check into FALSE, so invalid, unparseable and
    # missing dates follow the same rule.
    !(DT_SIN_PRI_check %in% TRUE) & DT_NOTIFIC_check %in% TRUE ~ DT_NOTIFIC,
    TRUE ~ DT_SIN_PRI
  )) %>%
  select(-DT_SIN_PRI_check, -DT_NOTIFIC_check)

Check improvement

# Study period used to decide whether a date is valid.
valid_interval <- interval(ymd("2011-01-01"), ymd("2022-12-31"))

# Frequency table of DT_SIN_PRI validity after the imputation.
dengue %>%
  mutate(DT_SIN_PRI_check = ymd(DT_SIN_PRI) %within% valid_interval) %>%
  count(DT_SIN_PRI_check, name = "freq") %>%
  kable(format.args = list(big.mark = ".", decimal.mark = ","))
DT_SIN_PRI_check freq
TRUE 16.956.001

There was an improvement on 25,823 records with invalid dates, and all missing records were imputed.

Export improved database

# Write the imputed table to a single parquet file.
write_parquet(
  dengue,
  sink = data_dir("dengue_data/parquet_improved/dengue_improved.parquet")
)

Session info

# Record the R version, platform, locale and package versions used for this
# run, for reproducibility.
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] knitr_1.45      arrow_14.0.0.2  lubridate_1.9.3 forcats_1.0.0  
 [5] stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2     readr_2.1.5    
 [9] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.4.4   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] bit_4.0.5         gtable_0.3.4      jsonlite_1.8.8    compiler_4.3.2   
 [5] tidyselect_1.2.0  assertthat_0.2.1  scales_1.3.0      yaml_2.3.8       
 [9] fastmap_1.1.1     R6_2.5.1          generics_0.1.3    htmlwidgets_1.6.4
[13] munsell_0.5.0     pillar_1.9.0      tzdb_0.4.0        rlang_1.1.3      
[17] utf8_1.2.4        stringi_1.8.3     xfun_0.42         bit64_4.0.5      
[21] timechange_0.3.0  cli_3.6.2         withr_3.0.0       magrittr_2.0.3   
[25] digest_0.6.34     grid_4.3.2        hms_1.1.3         lifecycle_1.0.4  
[29] vctrs_0.6.5       evaluate_0.23     glue_1.7.0        fansi_1.0.6      
[33] colorspace_2.1-0  rmarkdown_2.25    tools_4.3.2       pkgconfig_2.0.3  
[37] htmltools_0.5.7  
Back to top