Raw files

Author

Raphael Saldanha

Last modification

December 1, 2023 | 09:07:18 +01:00

This notebook presents the process of downloading the SINAN Dengue raw data files from DataSUS, pre-processing them with the {microdatasus} package, and exporting both the raw and the pre-processed data to parquet files.

Packages

library(tidyverse)
# remotes::install_github("rfsaldanha/microdatasus")
library(microdatasus)
library(arrow)
source("../functions.R")

Execution node

# Print the name of the machine this notebook was executed on.
# NOTE(review): node_name() is presumably defined in ../functions.R — verify.
node_name()
[1] "fatnode"

Data download

# One-off download step, kept commented out so that rendering the notebook
# does not fetch the data again. For each year it fetches the SINAN-DENGUE
# records, writes them unmodified to a parquet file, applies
# process_sinan_dengue(), and writes the processed result to a second
# parquet file.
# NOTE(review): the loop covers 2011-2021 only, while the overview output
# reports 15 parquet files with notification dates starting in 2007 —
# confirm the year range that was actually used.
# NOTE(review): the output paths here are relative, whereas the dataset is
# later read through data_dir("dengue_data/parquets") — confirm both point
# to the same location.
# for(a in 2011:2021){
#   tmp <- fetch_datasus(
#     year_start = a, year_end = a,
#     information_system = "SINAN-DENGUE"
#   )
# 
#   write_parquet(x = tmp, sink = paste0("parquets_raw/raw_dengue_", a, ".parquet"))
#   
#   tmp <- process_sinan_dengue(data = tmp)
#   
#   write_parquet(x = tmp, sink = paste0("parquets/dengue_", a, ".parquet"))
#   
#   rm(tmp)
# }
Warning

This code chunk is commented out to avoid unnecessarily downloading the raw DBC files from DataSUS again.

Overview

# Open the directory of pre-processed parquet files as a single Arrow dataset.
dengue <- open_dataset(sources = data_dir("dengue_data/parquets"))

# Show the dataset dimensions, column types and the first values per column.
glimpse(dengue)
FileSystemDataset with 15 Parquet files
19,180,457 rows x 75 columns
$ TP_NOT       <string> "Individual", "Individual", "Individual", "Individual", …
$ ID_AGRAVO    <string> "A90", "A90", "A90", "A90", "A90", "A90", "A90", "A90", …
$ DT_NOTIFIC   <string> "2007-02-12", "2007-02-12", "2007-02-26", "2007-02-12", …
$ SEM_NOT      <string> "200707", "200707", "200709", "200707", "200706", "20070…
$ NU_ANO       <string> "2007", "2007", "2007", "2007", "2007", "2007", "2007", …
$ SG_UF_NOT    <string> "Maranhão", "Maranhão", "Pernambuco", "Bahia", "Bahia", …
$ ID_MUNICIP   <string> "210780", "210780", "260640", "292150", "292150", "29215…
$ ID_REGIONA   <string> "1577", "1577", "1499", "1410", "1410", "1410", "1410", …
$ ID_UNIDADE   <string> "2454793", "2454793", "2436027", "3340392", "3340392", "…
$ DT_SIN_PRI   <string> "2007-02-09", "2007-02-10", "2007-02-21", "2007-01-28", …
$ SEM_PRI      <string> "200706", "200706", "200708", "200705", "200703", "20070…
$ CS_SEXO      <string> "Feminino", "Feminino", "Masculino", "Feminino", "Mascul…
$ CS_GESTANT   <string> "Ignorado", "Ignorado", "Não se aplica", "Não se aplica"…
$ CS_RACA      <string> "Parda", "Parda", "Branca", "Branca", "Parda", "Parda", …
$ CS_ESCOL_N   <string> "5a à 8a série incompleta do EF (antigo ginásio ou 1o gr…
$ SG_UF        <string> "Maranhão", "Maranhão", "Pernambuco", "Bahia", "Bahia", …
$ ID_MN_RESI   <string> "210780", "210780", "260640", "292150", "292150", "29215…
$ ID_RG_RESI   <string> "1577", "1577", "1499", "1410", "1410", "1410", "1410", …
$ ID_PAIS      <string> "BRASIL", "BRASIL", "BRASIL", "BRASIL", "BRASIL", "BRASI…
$ NDUPLIC_N    <string> "Não identificado", "Não identificado", NA, NA, NA, NA, …
$ DT_DIGITA    <string> NA, NA, "2007-03-08", "2007-05-22", "2007-05-02", "2007-…
$ CS_FLXRET    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ FLXRECEBI    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ MIGRADO_W    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_INVEST    <string> "2007-02-12", "2007-02-12", "2007-02-26", "2007-02-12", …
$ ID_OCUPA_N   <string> "Nao informada", "Nao informada", NA, "Nao informada", "…
$ DT_SORO      <string> NA, NA, "2007-02-26", "2007-02-01", "2007-02-09", NA, NA…
$ RESUL_SORO   <string> "Não realizado", "Não realizado", "Não reagente", "Reage…
$ HISTOPA_N    <string> "Não realizado", "Não realizado", "Não realizado", "Não …
$ DT_VIRAL     <string> NA, NA, NA, NA, NA, NA, NA, NA, "2007-02-14", NA, NA, NA…
$ RESUL_VI_N   <string> "Não realizado", "Não realizado", NA, "Não realizado", "…
$ SOROTIPO     <string> NA, NA, NA, NA, NA, NA, NA, NA, "DEN 3", NA, NA, NA, NA,…
$ IMUNOH_N     <string> "Não realizado", "Não realizado", "Não realizado", "Não …
$ DT_PCR       <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ RESUL_PCR_   <string> "Não realizado", "Não realizado", NA, "Não realizado", "…
$ CLASSI_FIN   <string> "Dengue clássico", "Dengue clássico", "Descartado", "Den…
$ CRITERIO     <string> "Clínico epidemiológico", "Clínico epidemiológico", "Clí…
$ TPAUTOCTO    <string> "Sim", "Sim", NA, "Não", "Sim", NA, NA, "Não", "Sim", NA…
$ COUFINF      <string> "Maranhão", "Maranhão", NA, "Bahia", "Bahia", NA, NA, "B…
$ COPAISINF    <string> "BRASIL", "BRASIL", NA, "BRASIL", "BRASIL", NA, NA, "BRA…
$ COMUNINF     <string> "210780", "210780", NA, "292150", "292150", NA, NA, "292…
$ DOENCA_TRA   <string> "Não", "Não", NA, "Não", "Não", NA, NA, "Não", "Não", NA…
$ EVOLUCAO     <string> "Cura", "Cura", NA, "Cura", "Cura", NA, NA, "Cura", "Cur…
$ DT_OBITO     <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_ENCERRA   <string> "2007-05-08", "2007-05-08", "2007-04-19", "2007-05-28", …
$ MANI_HEMOR   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ EPISTAXE     <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ GENGIVO      <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ METRO        <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PETEQUIAS    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ HEMATURA     <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ SANGRAM      <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ LACO_N       <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PLASMATICO   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ EVIDENCIA    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ PLAQ_MENOR   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ CON_FHD      <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ COMPLICA     <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ HOSPITALIZ   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ DT_INTERNA   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ UF           <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ MUNICIPIO    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEminutos <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEhoras   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEdias    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ IDADEmeses   <string> NA, NA, NA, "0", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ IDADEanos    <string> "3", "1", "3", NA, "0", "1", "3", "0", "3", "0", "1", "2…
$ munResStatus <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResTipo   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResNome   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResUf     <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResLat    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResLon    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResAlt    <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ munResArea   <string> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
Call `print()` for full schema details

Session info

# Record the R version, platform, locale and package versions used to render
# this notebook, for reproducibility.
sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/raphaelfs/miniconda3/envs/quarto/lib/libopenblasp-r0.3.23.so

locale:
 [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=pt_BR.UTF-8        LC_COLLATE=pt_BR.UTF-8    
 [5] LC_MONETARY=pt_BR.UTF-8    LC_MESSAGES=pt_BR.UTF-8   
 [7] LC_PAPER=pt_BR.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=pt_BR.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] arrow_12.0.0       microdatasus_2.2.0 lubridate_1.9.2    forcats_1.0.0     
 [5] stringr_1.5.0      dplyr_1.1.2        purrr_1.0.1        readr_2.1.4       
 [9] tidyr_1.3.0        tibble_3.2.1       ggplot2_3.4.2      tidyverse_2.0.0   

loaded via a namespace (and not attached):
 [1] compiler_4.2.3    pillar_1.9.0      tools_4.2.3       bit_4.0.5        
 [5] digest_0.6.31     timechange_0.2.0  jsonlite_1.8.5    evaluate_0.21    
 [9] lifecycle_1.0.3   gtable_0.3.3      pkgconfig_2.0.3   rlang_1.1.1      
[13] cli_3.6.1         rstudioapi_0.14   yaml_2.3.7        xfun_0.39        
[17] fastmap_1.1.1     withr_2.5.0       knitr_1.43        generics_0.1.3   
[21] vctrs_0.6.3       htmlwidgets_1.6.2 hms_1.1.3         bit64_4.0.5      
[25] grid_4.2.3        tidyselect_1.2.0  data.table_1.14.8 glue_1.6.2       
[29] R6_2.5.1          fansi_1.0.4       rmarkdown_2.22    tzdb_0.4.0       
[33] magrittr_2.0.3    scales_1.2.1      htmltools_0.5.5   assertthat_0.2.1 
[37] colorspace_2.1-0  utf8_1.2.3        stringi_1.7.12    munsell_0.5.0    
Back to top