Datasets

Author

Raphael Saldanha

Last modification

February 16, 2024 | 05:44:05 -03:00

Packages

library(tidyverse)
library(arrow)
library(qs)
library(sessioninfo)
source("../../functions.R")

Data source

Datasets from SINAN Dengue and Chikungunya were previously downloaded from the Brazilian Health Ministry in DBC format, pre-processed (variable labels) and converted to the parquet format.

Dengue

Data prior to 2016 does not have patient symptoms and clinical conditions
Data from 2016 to 2023 is being considered

# Data sources: one parquet file per year, 2016 to 2023. Each relative path
# is resolved by data_dir() (presumably defined in ../../functions.R).
dengue_files_list <- do.call(
  c,
  lapply(
    2016:2023,
    function(yr) {
      data_dir(sprintf("dengue_data/parquets/dengue_%d.parquet", yr))
    }
  )
)

symp_cond_vars contains a list of variables regarding symptoms and clinical conditions.

grave_cases_vars contains a list of clinical symptoms specifically for grave cases (its definition is currently commented out and it is not used below).

case_vars contains the names of other variables of interest.

# Case-level variables kept alongside the classification columns.
# ID_MN_RESI is later used to derive the state code, DT_SIN_PRI is parsed
# as a date and IDADEanos is converted to numeric.
case_vars <- c(
  "ID_MN_RESI",
  "IDADEanos",
  "DT_SIN_PRI",
  "CS_SEXO",
  "CS_GESTANT"
)

# Symptoms and clinical conditions. The order matters: it defines the column
# order of the datasets built from this vector.
symp_cond_vars <- c(
  # presumably signs and symptoms -- confirm against the SINAN dictionary
  "FEBRE", "MIALGIA", "CEFALEIA", "EXANTEMA", "VOMITO", "NAUSEA",
  "DOR_COSTAS", "CONJUNTVIT", "ARTRITE", "ARTRALGIA", "PETEQUIA_N",
  "LEUCOPENIA", "LACO", "DOR_RETRO",
  # presumably pre-existing clinical conditions -- confirm as above
  "DIABETES", "HEMATOLOG", "HEPATOPAT", "RENAL", "HIPERTENSA",
  "ACIDO_PEPT", "AUTO_IMUNE"
)

# grave_cases_vars <- c("ALRM_HIPOT", "ALRM_PLAQ", "ALRM_VOM",
#                       "ALRM_SANG", "ALRM_HEMAT", "ALRM_ABDOM", 
#                       "ALRM_LETAR", "ALRM_HEPAT", "ALRM_LIQ",
#                       "GRAV_PULSO", "GRAV_CONV", "GRAV_ENCH",
#                       "GRAV_INSUF", "GRAV_TAQUI", "GRAV_EXTRE",
#                       "GRAV_HIPOT", "GRAV_HEMAT", "GRAV_MELEN",
#                       "GRAV_METRO", "GRAV_SANG", "GRAV_AST",
#                       "GRAV_MIOC", "GRAV_CONSC", "GRAV_ORGAO",
#                       "MANI_HEMOR", "EPISTAXE", "GENGIVO", 
#                       "METRO", "PETEQUIAS", "HEMATURA",
#                       "SANGRAM", "LACO_N", "PLASMATICO",
#                       "PLAQ_MENOR", "CON_FHD", "COMPLICA")

We will read the datasets and select the variables of interest.

# Prepare data: open the yearly parquet files as one Arrow dataset, keep the
# classification columns plus the case and symptom/condition variables, and
# collect() the result into an in-memory data frame.
dengue_full <- dengue_files_list |>
  arrow::open_dataset() |>
  select(
    CLASSI_FIN, CRITERIO,
    all_of(case_vars),
    all_of(symp_cond_vars)
  ) |>
  collect()

The full dengue dataset has the following dimensions:

# Number of rows and columns of the collected Dengue data
dim(dengue_full)

[1] 12365498       28

The variable `CLASSI_FIN` presents the case’s diagnosis and `CRITERIO` presents the method used to determine the diagnosis.

# Prepare variables: translate the Portuguese labels of the final
# classification (CLASSI_FIN) and of the criterion (CRITERIO) into English.
# Labels that match none of the rules, and missing values, are kept as is.
dengue_full <- dengue_full |>
  mutate(
    CLASSI_FIN = case_when(
      # Any label mentioning "dengue", regardless of case, is a positive case
      str_detect(tolower(CLASSI_FIN), "dengue") ~ "Positive Dengue",
      CLASSI_FIN == "Descartado" ~ "Discarded Dengue",
      CLASSI_FIN == "Inconclusivo" ~ "Inconclusive",
      .default = CLASSI_FIN
    ),
    CRITERIO = case_when(
      CRITERIO == "Clínico epidemiológico" ~ "Clinical and epidemiological",
      CRITERIO == "Laboratório" ~ "Laboratorial",
      CRITERIO == "Em investigação" ~ "Being investigated",
      .default = CRITERIO
    )
  )

Let’s see how many suspected cases fall into each combination of classification and criterion.

# Number of suspected Dengue cases per classification and criterion.
# count() returns the same sorted table as group_by() + summarise(n()),
# already ungrouped and without the `.groups` message.
dengue_full |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Dengue	Being investigated	6338
Discarded Dengue	Clinical and epidemiological	1995427
Discarded Dengue	Laboratorial	2351751
Discarded Dengue	NA	24
Inconclusive	Being investigated	40884
Inconclusive	Clinical and epidemiological	15557
Inconclusive	Laboratorial	6718
Inconclusive	NA	1338016
Positive Dengue	Being investigated	64797
Positive Dengue	Clinical and epidemiological	3865015
Positive Dengue	Laboratorial	2612007
Positive Dengue	NA	23
NA	Being investigated	3405
NA	Clinical and epidemiological	202
NA	Laboratorial	93
NA	NA	65241

We may recode some of the cases, filter out exceptions and add a variable to retain the original information system name.

# Recode, filter and tag the Dengue records.
dengue_full <- dengue_full |>
  mutate(
    # Inconclusive cases without a criterion get an explicit label, so they
    # are not dropped together with the truly missing criteria.
    CRITERIO = case_when(
      CLASSI_FIN == "Inconclusive" & is.na(CRITERIO) ~ "Other reasons",
      .default = CRITERIO
    )
  ) |>
  # Keep only the expected labels, using the same whitelist approach as the
  # Chikungunya data. This drops missing values and "Being investigated"
  # cases, and also any unexpected label that would otherwise pass through.
  filter(
    CLASSI_FIN %in% c("Discarded Dengue", "Inconclusive", "Positive Dengue"),
    CRITERIO %in% c(
      "Clinical and epidemiological", "Laboratorial", "Other reasons"
    )
  ) |>
  # Retain the name of the originating information system
  mutate(case_source = "SINAN-Dengue")

dim(dengue_full)

[1] 12184491       29

# Cases per classification and criterion after recoding and filtering.
# count() yields an ungrouped, sorted table without the `.groups` message.
dengue_full |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Dengue	Clinical and epidemiological	1995427
Discarded Dengue	Laboratorial	2351751
Inconclusive	Clinical and epidemiological	15557
Inconclusive	Laboratorial	6718
Inconclusive	Other reasons	1338016
Positive Dengue	Clinical and epidemiological	3865015
Positive Dengue	Laboratorial	2612007

# Save the filtered Dengue data in qs format (path relative to the working
# directory).
qs::qsave(dengue_full, file = "dengue_full.qs")

Chikungunya

The same procedure executed on Dengue files is performed on Chikungunya reported suspected cases.

Data prior to 2017 does not have patient symptoms and clinical conditions.

# Data sources: one parquet file per year, 2017 to 2023. Each relative path
# is resolved by data_dir() (presumably defined in ../../functions.R).
chik_files_list <- do.call(
  c,
  lapply(
    2017:2023,
    function(yr) {
      data_dir(sprintf("chik_data/parquets/chik_%d.parquet", yr))
    }
  )
)

# Prepare data: open the yearly parquet files as one Arrow dataset, keep the
# classification columns plus the case and symptom/condition variables, and
# collect() the result into an in-memory data frame.
chik_full <- chik_files_list |>
  arrow::open_dataset() |>
  select(
    CLASSI_FIN, CRITERIO,
    all_of(case_vars),
    all_of(symp_cond_vars)
  ) |>
  collect()

dim(chik_full)

[1] 1296213      28

# Prepare variables: translate the Portuguese labels of the final
# classification (CLASSI_FIN) and of the criterion (CRITERIO) into English.
# Labels that match none of the rules, and missing values, are kept as is.
chik_full <- chik_full |>
  mutate(
    CLASSI_FIN = case_when(
      # Any label mentioning "chik", regardless of case, is a positive case
      str_detect(tolower(CLASSI_FIN), "chik") ~ "Positive Chikungunya",
      CLASSI_FIN == "Descartado" ~ "Discarded Chikungunya",
      CLASSI_FIN == "Inconclusivo" ~ "Inconclusive",
      .default = CLASSI_FIN
    ),
    CRITERIO = case_when(
      CRITERIO == "Clínico epidemiológico" ~ "Clinical and epidemiological",
      CRITERIO == "Laboratório" ~ "Laboratorial",
      CRITERIO == "Em investigação" ~ "Being investigated",
      .default = CRITERIO
    )
  )

# Number of suspected Chikungunya cases per classification and criterion.
# count() returns the same sorted table as group_by() + summarise(n()),
# already ungrouped and without the `.groups` message.
chik_full |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
!�	�	1
Dengue	Clinical and epidemiological	1
Dengue clássico	Clinical and epidemiological	20
Dengue clássico	Laboratorial	4
Dengue com complicações	Clinical and epidemiological	6
Dengue com complicações	Laboratorial	7
Discarded Chikungunya	Being investigated	484
Discarded Chikungunya	Clinical and epidemiological	189171
Discarded Chikungunya	Laboratorial	208789
H0	�	1
Inconclusive	Clinical and epidemiological	1
Inconclusive	Laboratorial	2
Inconclusive	NA	16
Positive Chikungunya	Being investigated	12494
Positive Chikungunya	Clinical and epidemiological	447464
Positive Chikungunya	Laboratorial	269732
Positive Chikungunya	NA	1
��	$	1
NA	Being investigated	6846
NA	Clinical and epidemiological	506
NA	Laboratorial	290
NA	NA	160376

# Recode, filter and tag the Chikungunya records.
chik_full <- chik_full |>
  mutate(
    # Inconclusive cases without a criterion get an explicit label.
    # `%in%` never yields NA, so rows with a missing classification are left
    # untouched.
    CRITERIO = replace(
      CRITERIO,
      CLASSI_FIN %in% "Inconclusive" & is.na(CRITERIO),
      "Other reasons"
    )
  ) |>
  # Whitelist the valid labels; everything else (missing values, labels from
  # other diseases, corrupted strings, cases being investigated) is dropped.
  filter(
    CLASSI_FIN %in% c(
      "Discarded Chikungunya", "Inconclusive", "Positive Chikungunya"
    ),
    CRITERIO %in% c(
      "Clinical and epidemiological", "Laboratorial", "Other reasons"
    )
  ) |>
  # Retain the name of the originating information system
  mutate(case_source = "SINAN-Chikungunya")

dim(chik_full)

[1] 1115175      29

# Cases per classification and criterion after recoding and filtering.
# count() yields an ungrouped, sorted table without the `.groups` message.
chik_full |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Chikungunya	Clinical and epidemiological	189171
Discarded Chikungunya	Laboratorial	208789
Inconclusive	Clinical and epidemiological	1
Inconclusive	Laboratorial	2
Inconclusive	Other reasons	16
Positive Chikungunya	Clinical and epidemiological	447464
Positive Chikungunya	Laboratorial	269732

# Save the filtered Chikungunya data in qs format (path relative to the
# working directory).
qs::qsave(chik_full, file = "chik_full.qs")

Join datasets

We will join both datasets, correct the date variable (DT_SIN_PRI) and remove invalid rows.

# Stack the Dengue and Chikungunya records (Dengue rows first)
dcdata <- dengue_full |>
  bind_rows(chik_full)

dim(dcdata)

[1] 13299666       29

And remove no longer needed objects.

# Remove the per-disease objects and file lists, then run the garbage
# collector and print its memory report.
rm(list = c(
  "dengue_full", "chik_full", "chik_files_list", "dengue_files_list"
))
gc()

            used   (Mb) gc trigger   (Mb)   max used   (Mb)
Ncells   1542344   82.4    2484897  132.8    2484897  132.8
Vcells 388378875 2963.1 1136575913 8671.4 1135963372 8666.8

# Cases per classification and criterion in the joined dataset.
# count() yields an ungrouped, sorted table without the `.groups` message.
dcdata |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Chikungunya	Clinical and epidemiological	189171
Discarded Chikungunya	Laboratorial	208789
Discarded Dengue	Clinical and epidemiological	1995427
Discarded Dengue	Laboratorial	2351751
Inconclusive	Clinical and epidemiological	15558
Inconclusive	Laboratorial	6720
Inconclusive	Other reasons	1338032
Positive Chikungunya	Clinical and epidemiological	447464
Positive Chikungunya	Laboratorial	269732
Positive Dengue	Clinical and epidemiological	3865015
Positive Dengue	Laboratorial	2612007

Data preparation

# Data preparation of the joined dataset.
dcdata <- dcdata |>
  mutate(
    # Date of first symptoms; values that fail to parse become NA (with a
    # warning) and are removed by the filter below.
    DT_SIN_PRI = as_date(DT_SIN_PRI, format = "%Y-%m-%d"),
    IDADEanos = as.numeric(IDADEanos),
    # Symptoms and conditions as logical flags: TRUE only for "Sim".
    # `%in%` maps missing values to FALSE, so no separate NA replacement
    # (previously done with a numeric 0) is needed.
    across(all_of(symp_cond_vars), \(x) x %in% "Sim"),
    # First two characters of the municipality code (R strings are 1-based)
    uf_res = substr(ID_MN_RESI, 1, 2)
  ) |>
  # Keep cases with onset in 2016-2023; rows with a missing date are dropped
  filter(between(year(DT_SIN_PRI), 2016, 2023)) |>
  # Drop rows with a missing value in any column. Unlike na.omit(), this does
  # not attach an `na.action` attribute listing every removed row.
  drop_na()

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `DT_SIN_PRI = as_date(DT_SIN_PRI, format = "%Y-%m-%d")`.
Caused by warning:
!  6 failed to parse.

Let’s see how the case classification and the criteria used evolve over time.

# Monthly number of cases by criterion, one panel per classification and
# information system. Inconclusive cases are left out.
dcdata |>
  filter(CLASSI_FIN != "Inconclusive", CRITERIO != "Other reasons") |>
  # Bin by the first day of the month of onset. ceiling_date() on Date values
  # would move every case to the first day of the following month.
  mutate(DT_SIN_PRI = floor_date(DT_SIN_PRI, "month")) |>
  # count() returns an ungrouped table without the `.groups` message
  count(case_source, DT_SIN_PRI, CLASSI_FIN, CRITERIO, name = "count") |>
  ggplot(aes(x = DT_SIN_PRI, y = count, fill = CRITERIO)) +
  geom_area(stat = "identity") +
  # Show the y axis in thousands of cases
  scale_y_continuous(labels = scales::unit_format(
    unit = "k",
    scale = 1e-3,
    accuracy = 1)
  ) +
  facet_wrap(~CLASSI_FIN + case_source, scales = "free_y") +
  theme_bw() +
  theme(legend.position = "bottom", legend.direction = "horizontal")

`summarise()` has grouped output by 'case_source', 'DT_SIN_PRI', 'CLASSI_FIN'.
You can override using the `.groups` argument.

Data removal

Caution

Only for tests.

# Caution: only for tests. Work on a random subsample of 500,000 cases.
# The seed (an arbitrary value) makes the subsample, and every dataset
# derived from it, reproducible across renders.
set.seed(1234)
dcdata <- slice_sample(.data = dcdata, n = 500000)

qsave(x = dcdata, file = "dcdata.qs")

Reference dataset

The reference dataset will contain only suspected cases that were diagnosed by laboratory exams, to train and test a classification model. Cases with inconclusive classification will be discarded.

# Reference dataset: laboratory-confirmed classifications only, without the
# inconclusive cases.
dc_ref <- filter(
  dcdata,
  CRITERIO == "Laboratorial",
  CLASSI_FIN != "Inconclusive"
)

dim(dc_ref)

[1] 204270     30

# Cases per classification and criterion in the reference dataset.
# count() yields an ungrouped, sorted table without the `.groups` message.
dc_ref |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Chikungunya	Laboratorial	7676
Discarded Dengue	Laboratorial	87795
Positive Chikungunya	Laboratorial	10206
Positive Dengue	Laboratorial	98593

# Save the reference dataset in qs format (path relative to the working
# directory).
qs::qsave(dc_ref, file = "dc_ref.qs")

Also, a simplified version is created, aggregating the classification not considering the specific disease, with a more balanced dataset.

# Simplified reference dataset: collapse the disease-specific labels into
# "Negative" / "Positive". Any other label would become NA, because no
# default is given.
dc_ref_simp <- dc_ref |>
  mutate(
    CLASSI_FIN = case_when(
      CLASSI_FIN %in% c("Discarded Chikungunya", "Discarded Dengue") ~
        "Negative",
      CLASSI_FIN %in% c("Positive Dengue", "Positive Chikungunya") ~
        "Positive"
    )
  )

# Cases per simplified classification in the reference dataset.
# count() yields an ungrouped, sorted table without the `.groups` message.
dc_ref_simp |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Negative	Laboratorial	95471
Positive	Laboratorial	108799

# Save the simplified reference dataset in qs format (path relative to the
# working directory).
qs::qsave(dc_ref_simp, file = "dc_ref_simp.qs")

Clinical dataset

In this dataset, we will only include the cases that were classified by clinical and epidemiological criteria. Inconclusive cases are discarded.

# Clinical dataset: cases classified by clinical and epidemiological
# criteria, without the inconclusive ones.
# NOTE(review): unlike dc_ref, dc_cli is never written to disk in this
# file; confirm whether that is intended.
dc_cli <- filter(
  dcdata,
  CRITERIO == "Clinical and epidemiological",
  CLASSI_FIN != "Inconclusive"
)

dim(dc_cli)

[1] 244853     30

# Cases per classification and criterion in the clinical dataset.
# count() yields an ungrouped, sorted table without the `.groups` message.
dc_cli |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Discarded Chikungunya	Clinical and epidemiological	7131
Discarded Dengue	Clinical and epidemiological	73950
Positive Chikungunya	Clinical and epidemiological	17084
Positive Dengue	Clinical and epidemiological	146688

Also, a simplified version of this dataset is created.

# Simplified clinical dataset: collapse the disease-specific labels into
# "Negative" / "Positive". Any other label would become NA, because no
# default is given.
dc_cli_simp <- dc_cli |>
  mutate(
    CLASSI_FIN = case_when(
      CLASSI_FIN %in% c("Discarded Chikungunya", "Discarded Dengue") ~
        "Negative",
      CLASSI_FIN %in% c("Positive Dengue", "Positive Chikungunya") ~
        "Positive"
    )
  )

dim(dc_cli_simp)

[1] 244853     30

# Cases per simplified classification in the clinical dataset.
# count() yields an ungrouped, sorted table without the `.groups` message.
dc_cli_simp |>
  count(CLASSI_FIN, CRITERIO, name = "count") |>
  gt::gt()

`summarise()` has grouped output by 'CLASSI_FIN'. You can override using the
`.groups` argument.

CLASSI_FIN	CRITERIO	count
Negative	Clinical and epidemiological	81081
Positive	Clinical and epidemiological	163772

# Save the simplified clinical dataset in qs format (path relative to the
# working directory).
qs::qsave(dc_cli_simp, file = "dc_cli_simp.qs")

The joined data can be removed at this stage.

# Remove the joined dataset, then run the garbage collector and print its
# memory report.
rm(list = "dcdata")
gc()

           used  (Mb) gc trigger   (Mb)   max used   (Mb)
Ncells  1957221 104.6    4178680  223.2    4178680  223.2
Vcells 13088512  99.9  727408585 5549.7 1135963372 8666.8

Session info

# Record the R version, platform and package versions used for this render
sessioninfo::session_info()

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.2 (2023-10-31)
 os       CentOS Linux 7 (Core)
 system   x86_64, linux-gnu
 ui       X11
 language (EN)
 collate  pt_BR.UTF-8
 ctype    pt_BR.UTF-8
 tz       America/Sao_Paulo
 date     2024-02-16
 pandoc   3.1.1 @ /home/raphaelfs/miniconda3/envs/quarto/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version date (UTC) lib source
 arrow         * 14.0.1  2023-11-14 [2] local
 assertthat      0.2.1   2019-03-21 [1] CRAN (R 4.2.2)
 bit             4.0.5   2022-11-15 [1] CRAN (R 4.2.2)
 bit64           4.0.5   2020-08-30 [1] CRAN (R 4.2.2)
 cli             3.6.1   2023-03-23 [1] CRAN (R 4.2.0)
 colorspace      2.1-0   2023-01-23 [1] CRAN (R 4.2.2)
 digest          0.6.31  2022-12-11 [1] CRAN (R 4.2.2)
 dplyr         * 1.1.2   2023-04-20 [1] CRAN (R 4.2.0)
 evaluate        0.21    2023-05-05 [1] CRAN (R 4.2.0)
 fansi           1.0.4   2023-01-22 [1] CRAN (R 4.2.2)
 farver          2.1.1   2022-07-06 [1] CRAN (R 4.2.2)
 fastmap         1.1.1   2023-02-24 [1] CRAN (R 4.2.2)
 forcats       * 1.0.0   2023-01-29 [1] CRAN (R 4.2.2)
 generics        0.1.3   2022-07-05 [1] CRAN (R 4.2.2)
 ggplot2       * 3.4.2   2023-04-03 [1] CRAN (R 4.2.0)
 glue            1.6.2   2022-02-24 [1] CRAN (R 4.2.2)
 gt              0.9.0   2023-03-31 [1] CRAN (R 4.2.0)
 gtable          0.3.3   2023-03-21 [1] CRAN (R 4.2.0)
 hms             1.1.3   2023-03-21 [1] CRAN (R 4.2.0)
 htmltools       0.5.5   2023-03-23 [1] CRAN (R 4.2.0)
 htmlwidgets     1.6.2   2023-03-17 [1] CRAN (R 4.2.0)
 jsonlite        1.8.5   2023-06-05 [1] CRAN (R 4.2.0)
 knitr           1.43    2023-05-25 [1] CRAN (R 4.2.0)
 labeling        0.4.2   2020-10-20 [1] CRAN (R 4.2.2)
 lifecycle       1.0.3   2022-10-07 [1] CRAN (R 4.2.2)
 lubridate     * 1.9.2   2023-02-10 [1] CRAN (R 4.2.2)
 magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.2.2)
 munsell         0.5.0   2018-06-12 [1] CRAN (R 4.2.2)
 pillar          1.9.0   2023-03-22 [1] CRAN (R 4.2.0)
 pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.2.2)
 purrr         * 1.0.1   2023-01-10 [1] CRAN (R 4.2.2)
 qs            * 0.25.5  2023-02-22 [1] CRAN (R 4.2.3)
 R6              2.5.1   2021-08-19 [1] CRAN (R 4.2.2)
 RApiSerialize   0.1.2   2022-08-25 [1] CRAN (R 4.2.3)
 Rcpp            1.0.10  2023-01-22 [1] CRAN (R 4.2.2)
 RcppParallel    5.1.7   2023-02-27 [1] CRAN (R 4.2.2)
 readr         * 2.1.4   2023-02-10 [1] CRAN (R 4.2.2)
 rlang           1.1.1   2023-04-28 [1] CRAN (R 4.2.0)
 rmarkdown       2.22    2023-06-01 [1] CRAN (R 4.2.0)
 rstudioapi      0.14    2022-08-22 [1] CRAN (R 4.2.2)
 sass            0.4.6   2023-05-03 [1] CRAN (R 4.2.0)
 scales          1.2.1   2022-08-20 [1] CRAN (R 4.2.2)
 sessioninfo   * 1.2.2   2021-12-06 [1] CRAN (R 4.2.1)
 stringfish      0.15.8  2023-05-30 [1] CRAN (R 4.2.3)
 stringi         1.7.12  2023-01-11 [1] CRAN (R 4.2.2)
 stringr       * 1.5.0   2022-12-02 [1] CRAN (R 4.2.2)
 tibble        * 3.2.1   2023-03-20 [1] CRAN (R 4.2.0)
 tidyr         * 1.3.0   2023-01-24 [1] CRAN (R 4.2.2)
 tidyselect      1.2.0   2022-10-10 [1] CRAN (R 4.2.2)
 tidyverse     * 2.0.0   2023-02-22 [2] CRAN (R 4.3.0)
 timechange      0.2.0   2023-01-11 [1] CRAN (R 4.2.1)
 tzdb            0.4.0   2023-05-12 [1] CRAN (R 4.2.0)
 utf8            1.2.3   2023-01-31 [1] CRAN (R 4.2.2)
 vctrs           0.6.3   2023-06-14 [1] CRAN (R 4.2.3)
 withr           2.5.0   2022-03-03 [1] CRAN (R 4.2.2)
 xfun            0.39    2023-04-20 [1] CRAN (R 4.2.0)
 xml2            1.3.4   2023-04-27 [1] CRAN (R 4.2.0)
 yaml            2.3.7   2023-01-23 [1] CRAN (R 4.2.2)

 [1] /home/raphaelfs/rlibs
 [2] /home/raphaelfs/miniconda3/envs/quarto/lib/R/library

──────────────────────────────────────────────────────────────────────────────