library(tidyverse)
# remotes::install_github("rfsaldanha/microdatasus")
library(microdatasus)
library(arrow)
source("../functions.R")
Raw files
This notebook documents how the SINAN Dengue raw data files are downloaded from DataSUS, pre-processed with the {microdatasus} package, and exported to parquet files.
Packages
Execution node
# Print the name of the machine (execution node) that ran this notebook.
# NOTE(review): node_name() is not defined here; presumably it comes from
# ../functions.R -- confirm.
node_name()
[1] "fatnode"
Data download
# Download and conversion loop, kept commented out so that the raw files are
# not downloaded from DataSUS again. For each year from 2011 to 2021 it:
#   1. fetches that year's "SINAN-DENGUE" records with fetch_datasus();
#   2. writes the download as-is to parquets_raw/raw_dengue_<year>.parquet;
#   3. runs process_sinan_dengue() on it and writes the result to
#      parquets/dengue_<year>.parquet;
#   4. removes the temporary object before moving to the next year.
# for(a in 2011:2021){
# tmp <- fetch_datasus(
# year_start = a, year_end = a,
# information_system = "SINAN-DENGUE"
# )
#
# write_parquet(x = tmp, sink = paste0("parquets_raw/raw_dengue_", a, ".parquet"))
#
# tmp <- process_sinan_dengue(data = tmp)
#
# write_parquet(x = tmp, sink = paste0("parquets/dengue_", a, ".parquet"))
#
# rm(tmp)
# }
Warning
This code chunk is commented out to avoid unnecessarily downloading the raw DBC files from DataSUS again.
Overview
# Open every parquet file under dengue_data/parquets as a single Arrow dataset.
# NOTE(review): data_dir() is not defined here; presumably it comes from
# ../functions.R and resolves the path inside the project data directory --
# confirm.
dengue <- open_dataset(sources = data_dir("dengue_data/parquets"))

# Preview the dataset: file count, dimensions, and the type and first values
# of each column.
dengue %>% glimpse()
FileSystemDataset with 15 Parquet files
19,180,457 rows x 75 columns
$ TP_NOT <string> "Individual", "Individual", "Individual", "Individual", …
$ ID_AGRAVO <string> "A90", "A90", "A90", "A90", "A90", "A90", "A90", "A90", …
$ DT_NOTIFIC <string> "2007-02-12", "2007-02-12", "2007-02-26", "2007-02-12", …
$ SEM_NOT <string> "200707", "200707", "200709", "200707", "200706", "20070…
$ NU_ANO <string> "2007", "2007", "2007", "2007", "2007", "2007", "2007", …
$ SG_UF_NOT <string> "Maranhão", "Maranhão", "Pernambuco", "Bahia", "Bahia", …
$ ID_MUNICIP <string> "210780", "210780", "260640", "292150", "292150", "29215…
$ ID_REGIONA <string> "1577", "1577", "1499", "1410", "1410", "1410", "1410", …
$ ID_UNIDADE <string> "2454793", "2454793", "2436027", "3340392", "3340392", "…
$ DT_SIN_PRI <string> "2007-02-09", "2007-02-10", "2007-02-21", "2007-01-28", …
$ SEM_PRI <string> "200706", "200706", "200708", "200705", "200703", "20070…
$ CS_SEXO <string> "Feminino", "Feminino", "Masculino", "Feminino", "Mascul…
$ CS_GESTANT <string> "Ignorado", "Ignorado", "Não se aplica", "Não se aplica"…
$ CS_RACA <string> "Parda", "Parda", "Branca", "Branca", "Parda", "Parda", …
$ CS_ESCOL_N <string> "5a à 8a série incompleta do EF (antigo ginásio ou 1o gr…
$ SG_UF <string> "Maranhão", "Maranhão", "Pernambuco", "Bahia", "Bahia", …
$ ID_MN_RESI <string> "210780", "210780", "260640", "292150", "292150", "29215…
$ ID_RG_RESI <string> "1577", "1577", "1499", "1410", "1410", "1410", "1410", …
$ ID_PAIS <string> "BRASIL", "BRASIL", "BRASIL", "BRASIL", "BRASIL", "BRASI…
$ NDUPLIC_N <string> "Não identificado", "Não identificado", NA, NA, NA, NA, …
$ DT_DIGITA <string> NA, NA, "2007-03-08", "2007-05-22", "2007-05-02", "2007-…
$ CS_FLXRET <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ FLXRECEBI <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ MIGRADO_W <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_INVEST <string> "2007-02-12", "2007-02-12", "2007-02-26", "2007-02-12", …
$ ID_OCUPA_N <string> "Nao informada", "Nao informada", NA, "Nao informada", "…
$ DT_SORO <string> NA, NA, "2007-02-26", "2007-02-01", "2007-02-09", NA, NA…
$ RESUL_SORO <string> "Não realizado", "Não realizado", "Não reagente", "Reage…
$ HISTOPA_N <string> "Não realizado", "Não realizado", "Não realizado", "Não …
$ DT_VIRAL <string> NA, NA, NA, NA, NA, NA, NA, NA, "2007-02-14", NA, NA, NA…
$ RESUL_VI_N <string> "Não realizado", "Não realizado", NA, "Não realizado", "…
$ SOROTIPO <string> NA, NA, NA, NA, NA, NA, NA, NA, "DEN 3", NA, NA, NA, NA,…
$ IMUNOH_N <string> "Não realizado", "Não realizado", "Não realizado", "Não …
$ DT_PCR <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ RESUL_PCR_ <string> "Não realizado", "Não realizado", NA, "Não realizado", "…
$ CLASSI_FIN <string> "Dengue clássico", "Dengue clássico", "Descartado", "Den…
$ CRITERIO <string> "Clínico epidemiológico", "Clínico epidemiológico", "Clí…
$ TPAUTOCTO <string> "Sim", "Sim", NA, "Não", "Sim", NA, NA, "Não", "Sim", NA…
$ COUFINF <string> "Maranhão", "Maranhão", NA, "Bahia", "Bahia", NA, NA, "B…
$ COPAISINF <string> "BRASIL", "BRASIL", NA, "BRASIL", "BRASIL", NA, NA, "BRA…
$ COMUNINF <string> "210780", "210780", NA, "292150", "292150", NA, NA, "292…
$ DOENCA_TRA <string> "Não", "Não", NA, "Não", "Não", NA, NA, "Não", "Não", NA…
$ EVOLUCAO <string> "Cura", "Cura", NA, "Cura", "Cura", NA, NA, "Cura", "Cur…
$ DT_OBITO <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_ENCERRA <string> "2007-05-08", "2007-05-08", "2007-04-19", "2007-05-28", …
$ MANI_HEMOR <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ EPISTAXE <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ GENGIVO <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ METRO <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PETEQUIAS <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ HEMATURA <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ SANGRAM <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LACO_N <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PLASMATICO <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ EVIDENCIA <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PLAQ_MENOR <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ CON_FHD <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ COMPLICA <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ HOSPITALIZ <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_INTERNA <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ UF <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ MUNICIPIO <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEminutos <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEhoras <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEdias <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEmeses <string> NA, NA, NA, "0", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ IDADEanos <string> "3", "1", "3", NA, "0", "1", "3", "0", "3", "0", "1", "2…
$ munResStatus <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResTipo <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResNome <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResUf <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResLat <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResLon <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResAlt <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResArea <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Call `print()` for full schema details
Session info
# Record the R version, platform, locale and package versions used for this
# run, so the results can be reproduced.
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /home/raphaelfs/miniconda3/envs/quarto/lib/libopenblasp-r0.3.23.so
locale:
[1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C
[3] LC_TIME=pt_BR.UTF-8 LC_COLLATE=pt_BR.UTF-8
[5] LC_MONETARY=pt_BR.UTF-8 LC_MESSAGES=pt_BR.UTF-8
[7] LC_PAPER=pt_BR.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] arrow_12.0.0 microdatasus_2.2.0 lubridate_1.9.2 forcats_1.0.0
[5] stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1 readr_2.1.4
[9] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] compiler_4.2.3 pillar_1.9.0 tools_4.2.3 bit_4.0.5
[5] digest_0.6.31 timechange_0.2.0 jsonlite_1.8.5 evaluate_0.21
[9] lifecycle_1.0.3 gtable_0.3.3 pkgconfig_2.0.3 rlang_1.1.1
[13] cli_3.6.1 rstudioapi_0.14 yaml_2.3.7 xfun_0.39
[17] fastmap_1.1.1 withr_2.5.0 knitr_1.43 generics_0.1.3
[21] vctrs_0.6.3 htmlwidgets_1.6.2 hms_1.1.3 bit64_4.0.5
[25] grid_4.2.3 tidyselect_1.2.0 data.table_1.14.8 glue_1.6.2
[29] R6_2.5.1 fansi_1.0.4 rmarkdown_2.22 tzdb_0.4.0
[33] magrittr_2.0.3 scales_1.2.1 htmltools_0.5.5 assertthat_0.2.1
[37] colorspace_2.1-0 utf8_1.2.3 stringi_1.7.12 munsell_0.5.0