library(tidyverse)
library(arrow)
library(knitr)
library(lubridate)
library(tidymodels)
library(finetune)
library(bonsai)
library(tictoc)
library(vip)
source("../functions.R")
Dengue case classification
by symptoms and clinical condition
The objective of this notebook is to train a model that reclassifies inconclusive cases of dengue, zika and chikungunya, based on the symptoms, clinical conditions and other patient-related variables presented by confirmed and discarded cases.
Packages
Data
- Cases classified as inconclusive are excluded from model training.
Dengue
- Data prior to 2016 does not have patient symptoms and clinical conditions.
# Data sources: one dengue parquet file per year, 2016 through 2021.
# NOTE(review): `data_dir()` is not defined in this notebook; presumably it is
# provided by ../functions.R and resolves a path under the data root - confirm.
files_list <- c(
  data_dir("dengue_data/parquets/dengue_2016.parquet"),
  data_dir("dengue_data/parquets/dengue_2017.parquet"),
  data_dir("dengue_data/parquets/dengue_2018.parquet"),
  data_dir("dengue_data/parquets/dengue_2019.parquet"),
  data_dir("dengue_data/parquets/dengue_2020.parquet"),
  data_dir("dengue_data/parquets/dengue_2021.parquet")
)
# Independent variables: the columns selected from the data set and later
# recoded to logical. Each column name is listed exactly once.
x_vars <- c(
  "FEBRE", "MIALGIA", "CEFALEIA",
  "EXANTEMA", "VOMITO", "NAUSEA",
  "DOR_COSTAS", "CONJUNTVIT",
  "ARTRITE", "ARTRALGIA", "PETEQUIA_N",
  "LEUCOPENIA", "LACO", "DOR_RETRO",
  "DIABETES", "HEMATOLOG", "HEPATOPAT",
  "RENAL", "HIPERTENSA",
  "ACIDO_PEPT", "AUTO_IMUNE"
)
# Prepare data: build the dengue modelling table from the parquet files
# listed in `files_list`.
dengue <- arrow::open_dataset(sources = files_list) %>%
  # Select variables: outcome, municipality, age, symptom onset date and
  # the independent variables
  select(all_of(
    c("CLASSI_FIN", "COMUNINF", "IDADEanos", "DT_SIN_PRI", x_vars)
  )) %>%
  # Filter out "Inconclusivo" cases
  filter(CLASSI_FIN != "Inconclusivo") %>%
  # Collect data from parquet files; the steps below run on the
  # collected data
  collect() %>%
  # Prepare variables
  mutate(
    # Binary outcome: every final classification other than "Descartado"
    # is labelled "Dengue"; the rest is labelled "Discarded"
    CLASSI_FIN = case_when(
      CLASSI_FIN != "Descartado" ~ "Dengue",
      .default = "Discarded"
    ),
    CLASSI_FIN = as.factor(CLASSI_FIN),
    # as_date() returns NA (with a warning) for values it cannot parse
    DT_SIN_PRI = as_date(DT_SIN_PRI),
    COMUNINF = as.factor(COMUNINF)
  ) %>%
  # Recode each independent variable to logical: TRUE when the value is "Sim"
  mutate(across(all_of(x_vars), ~ .x == "Sim"))
Chikungunya
- Data prior to 2017 does not have patient symptoms and clinical conditions.
# Data sources: one chikungunya parquet file per year, 2017 through 2021.
# This reassigns `files_list`, replacing the value set earlier in the notebook.
# NOTE(review): `data_dir()` is not defined in this notebook; presumably it is
# provided by ../functions.R and resolves a path under the data root - confirm.
files_list <- c(
  data_dir("chik_data/parquets/chik_2017.parquet"),
  data_dir("chik_data/parquets/chik_2018.parquet"),
  data_dir("chik_data/parquets/chik_2019.parquet"),
  data_dir("chik_data/parquets/chik_2020.parquet"),
  data_dir("chik_data/parquets/chik_2021.parquet")
)
# Independent variables: the columns selected from the chikungunya data set
# and later recoded to logical. Each column name is listed exactly once.
x_vars <- c(
  "FEBRE", "MIALGIA", "CEFALEIA",
  "EXANTEMA", "VOMITO", "NAUSEA",
  "DOR_COSTAS", "CONJUNTVIT",
  "ARTRITE", "ARTRALGIA", "PETEQUIA_N",
  "LEUCOPENIA", "LACO", "DOR_RETRO",
  "DIABETES", "HEMATOLOG", "HEPATOPAT",
  "RENAL", "HIPERTENSA",
  "ACIDO_PEPT", "AUTO_IMUNE"
)
# Prepare data: build the chikungunya modelling table from the parquet files
# listed in `files_list`.
chik <- arrow::open_dataset(sources = files_list) %>%
  # Select variables: outcome, municipality, age, symptom onset date and
  # the independent variables
  select(all_of(
    c("CLASSI_FIN", "COMUNINF", "IDADEanos", "DT_SIN_PRI", x_vars)
  )) %>%
  # Filter out "Inconclusivo" cases
  filter(CLASSI_FIN != "Inconclusivo") %>%
  # Collect data from parquet files; the steps below run on the
  # collected data
  collect() %>%
  # Prepare variables
  mutate(
    # Binary outcome: every final classification other than "Descartado"
    # is labelled "Chikungunya"; the rest is labelled "Discarded"
    CLASSI_FIN = case_when(
      CLASSI_FIN != "Descartado" ~ "Chikungunya",
      .default = "Discarded"
    ),
    CLASSI_FIN = as.factor(CLASSI_FIN),
    # as_date() returns NA (with a warning) for values it cannot parse
    DT_SIN_PRI = as_date(DT_SIN_PRI),
    COMUNINF = as.factor(COMUNINF)
  ) %>%
  # Recode each independent variable to logical: TRUE when the value is "Sim"
  mutate(across(all_of(x_vars), ~ .x == "Sim"))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `DT_SIN_PRI = as_date(DT_SIN_PRI)`.
Caused by warning:
! 2 failed to parse.
Session info
# Print the R version, platform, locale and package versions of this session
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] vip_0.4.1 tictoc_1.2 bonsai_0.2.1 finetune_1.1.0
[5] yardstick_1.3.0 workflowsets_1.0.1 workflows_1.1.3 tune_1.1.2
[9] rsample_1.2.0 recipes_1.0.9 parsnip_1.1.1 modeldata_1.3.0
[13] infer_1.0.6 dials_1.2.0 scales_1.3.0 broom_1.0.5
[17] tidymodels_1.1.1 knitr_1.45 arrow_14.0.0.2 lubridate_1.9.3
[21] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
[25] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.4.4
[29] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] tidyselect_1.2.0 timeDate_4032.109 fastmap_1.1.1
[4] digest_0.6.34 rpart_4.1.23 timechange_0.3.0
[7] lifecycle_1.0.4 survival_3.5-7 magrittr_2.0.3
[10] compiler_4.3.2 rlang_1.1.3 tools_4.3.2
[13] utf8_1.2.4 yaml_2.3.8 data.table_1.15.0
[16] htmlwidgets_1.6.4 bit_4.0.5 DiceDesign_1.10
[19] withr_3.0.0 nnet_7.3-19 grid_4.3.2
[22] fansi_1.0.6 colorspace_2.1-0 future_1.33.1
[25] iterators_1.0.14 globals_0.16.2 MASS_7.3-60
[28] cli_3.6.2 rmarkdown_2.25 generics_0.1.3
[31] rstudioapi_0.15.0 future.apply_1.11.1 tzdb_0.4.0
[34] splines_4.3.2 assertthat_0.2.1 parallel_4.3.2
[37] vctrs_0.6.5 hardhat_1.3.1 Matrix_1.6-3
[40] jsonlite_1.8.8 hms_1.1.3 bit64_4.0.5
[43] listenv_0.9.1 foreach_1.5.2 gower_1.0.1
[46] glue_1.7.0 parallelly_1.36.0 codetools_0.2-19
[49] stringi_1.8.3 gtable_0.3.4 GPfit_1.0-8
[52] munsell_0.5.0 pillar_1.9.0 furrr_0.3.1
[55] htmltools_0.5.7 ipred_0.9-14 lava_1.7.3
[58] R6_2.5.1 lhs_1.1.6 evaluate_0.23
[61] lattice_0.22-5 backports_1.4.1 class_7.3-22
[64] Rcpp_1.0.12 prodlim_2023.08.28 xfun_0.41
[67] pkgconfig_2.0.3