Dengue case classification

by symptoms and clinical condition

Author

Raphael Saldanha

Last modification

February 6, 2024 | 13:09:16 +01:00

The objective of this notebook is to train a model to reclassify inconclusive cases of dengue, zika and chikungunya based on the symptoms, clinical conditions and other patient-related variables presented by confirmed and discarded cases.

Packages

library(tidyverse)
library(arrow)
library(knitr)
library(lubridate)
library(tidymodels)
library(finetune)
library(bonsai)
library(tictoc)
library(vip)
source("../functions.R")

Data

  • Cases classified as inconclusive are excluded from model training.

Dengue

  • Data prior to 2016 does not have patient symptoms and clinical conditions.
# Data sources: one parquet file per year, 2016 to 2021
files_list <- c(
  data_dir("dengue_data/parquets/dengue_2016.parquet"),
  data_dir("dengue_data/parquets/dengue_2017.parquet"),
  data_dir("dengue_data/parquets/dengue_2018.parquet"),
  data_dir("dengue_data/parquets/dengue_2019.parquet"),
  data_dir("dengue_data/parquets/dengue_2020.parquet"),
  data_dir("dengue_data/parquets/dengue_2021.parquet")
)

# Independent variables: symptom and clinical condition columns.
# Each name must appear only once ("HEPATOPAT" used to be listed twice);
# a repeated name can make the "Sim" recode below run twice on that column,
# comparing an already-logical column to "Sim".
x_vars <- c("FEBRE", "MIALGIA", "CEFALEIA",
            "EXANTEMA", "VOMITO", "NAUSEA",
            "DOR_COSTAS", "CONJUNTVIT",
            "ARTRITE", "ARTRALGIA", "PETEQUIA_N",
            "LEUCOPENIA", "LACO", "DOR_RETRO",
            "DIABETES", "HEMATOLOG", "HEPATOPAT",
            "RENAL", "HIPERTENSA",
            "ACIDO_PEPT", "AUTO_IMUNE")

# Prepare data
dengue <- arrow::open_dataset(sources = files_list) %>%
  # Select the outcome, the patient variables and the predictors
  select(all_of(c("CLASSI_FIN", "COMUNINF", "IDADEanos", "DT_SIN_PRI", x_vars))) %>%
  # Filter out "Inconclusivo" cases
  filter(CLASSI_FIN != "Inconclusivo") %>%
  # Collect data from parquet files into memory
  collect() %>%
  # Binary outcome: every classification other than "Descartado" is "Dengue"
  mutate(CLASSI_FIN = case_when(
    CLASSI_FIN != "Descartado" ~ "Dengue",
    .default = "Discarded"
  )) %>%
  mutate(CLASSI_FIN = as.factor(CLASSI_FIN)) %>%
  mutate(DT_SIN_PRI = as_date(DT_SIN_PRI)) %>%
  mutate(COMUNINF = as.factor(COMUNINF)) %>%
  # Recode each predictor to logical: TRUE when the recorded value is "Sim"
  mutate(across(all_of(x_vars), ~ .x == "Sim"))

Chikungunya

  • Data prior to 2017 does not have patient symptoms and clinical conditions.
# Data sources: one parquet file per year, 2017 to 2021
files_list <- c(
  data_dir("chik_data/parquets/chik_2017.parquet"),
  data_dir("chik_data/parquets/chik_2018.parquet"),
  data_dir("chik_data/parquets/chik_2019.parquet"),
  data_dir("chik_data/parquets/chik_2020.parquet"),
  data_dir("chik_data/parquets/chik_2021.parquet")
)

# Independent variables: symptom and clinical condition columns.
# Each name must appear only once ("HEPATOPAT" used to be listed twice);
# a repeated name can make the "Sim" recode below run twice on that column,
# comparing an already-logical column to "Sim".
x_vars <- c("FEBRE", "MIALGIA", "CEFALEIA",
            "EXANTEMA", "VOMITO", "NAUSEA",
            "DOR_COSTAS", "CONJUNTVIT",
            "ARTRITE", "ARTRALGIA", "PETEQUIA_N",
            "LEUCOPENIA", "LACO", "DOR_RETRO",
            "DIABETES", "HEMATOLOG", "HEPATOPAT",
            "RENAL", "HIPERTENSA",
            "ACIDO_PEPT", "AUTO_IMUNE")

# Prepare data
chik <- arrow::open_dataset(sources = files_list) %>%
  # Select the outcome, the patient variables and the predictors
  select(all_of(c("CLASSI_FIN", "COMUNINF", "IDADEanos", "DT_SIN_PRI", x_vars))) %>%
  # Filter out "Inconclusivo" cases
  filter(CLASSI_FIN != "Inconclusivo") %>%
  # Collect data from parquet files into memory
  collect() %>%
  # Binary outcome: every classification other than "Descartado" is "Chikungunya"
  mutate(CLASSI_FIN = case_when(
    CLASSI_FIN != "Descartado" ~ "Chikungunya",
    .default = "Discarded"
  )) %>%
  mutate(CLASSI_FIN = as.factor(CLASSI_FIN)) %>%
  # NOTE(review): as_date() warns here that a few values fail to parse;
  # presumably those dates end up as NA - confirm before using DT_SIN_PRI
  mutate(DT_SIN_PRI = as_date(DT_SIN_PRI)) %>%
  mutate(COMUNINF = as.factor(COMUNINF)) %>%
  # Recode each predictor to logical: TRUE when the recorded value is "Sim"
  mutate(across(all_of(x_vars), ~ .x == "Sim"))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `DT_SIN_PRI = as_date(DT_SIN_PRI)`.
Caused by warning:
!  2 failed to parse.

Session info

# Print the R version, platform, locale and attached/loaded package versions
# used for this run
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Paris
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] vip_0.4.1          tictoc_1.2         bonsai_0.2.1       finetune_1.1.0    
 [5] yardstick_1.3.0    workflowsets_1.0.1 workflows_1.1.3    tune_1.1.2        
 [9] rsample_1.2.0      recipes_1.0.9      parsnip_1.1.1      modeldata_1.3.0   
[13] infer_1.0.6        dials_1.2.0        scales_1.3.0       broom_1.0.5       
[17] tidymodels_1.1.1   knitr_1.45         arrow_14.0.0.2     lubridate_1.9.3   
[21] forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2       
[25] readr_2.1.5        tidyr_1.3.1        tibble_3.2.1       ggplot2_3.4.4     
[29] tidyverse_2.0.0   

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0    timeDate_4032.109   fastmap_1.1.1      
 [4] digest_0.6.34       rpart_4.1.23        timechange_0.3.0   
 [7] lifecycle_1.0.4     survival_3.5-7      magrittr_2.0.3     
[10] compiler_4.3.2      rlang_1.1.3         tools_4.3.2        
[13] utf8_1.2.4          yaml_2.3.8          data.table_1.15.0  
[16] htmlwidgets_1.6.4   bit_4.0.5           DiceDesign_1.10    
[19] withr_3.0.0         nnet_7.3-19         grid_4.3.2         
[22] fansi_1.0.6         colorspace_2.1-0    future_1.33.1      
[25] iterators_1.0.14    globals_0.16.2      MASS_7.3-60        
[28] cli_3.6.2           rmarkdown_2.25      generics_0.1.3     
[31] rstudioapi_0.15.0   future.apply_1.11.1 tzdb_0.4.0         
[34] splines_4.3.2       assertthat_0.2.1    parallel_4.3.2     
[37] vctrs_0.6.5         hardhat_1.3.1       Matrix_1.6-3       
[40] jsonlite_1.8.8      hms_1.1.3           bit64_4.0.5        
[43] listenv_0.9.1       foreach_1.5.2       gower_1.0.1        
[46] glue_1.7.0          parallelly_1.36.0   codetools_0.2-19   
[49] stringi_1.8.3       gtable_0.3.4        GPfit_1.0-8        
[52] munsell_0.5.0       pillar_1.9.0        furrr_0.3.1        
[55] htmltools_0.5.7     ipred_0.9-14        lava_1.7.3         
[58] R6_2.5.1            lhs_1.1.6           evaluate_0.23      
[61] lattice_0.22-5      backports_1.4.1     class_7.3-22       
[64] Rcpp_1.0.12         prodlim_2023.08.28  xfun_0.41          
[67] pkgconfig_2.0.3    
Back to top