Exported data

Author

Raphael Saldanha

Last modification

December 1, 2023 | 09:07:18 +01:00

Packages

library(tidyverse)
library(lubridate)
library(arrow)
library(timetk)
library(brpop)
library(piggyback)

Dengue data

Total number of confirmed dengue cases, aggregated by municipality of residence and by week of first symptom onset.

# Weekly dengue case counts per municipality: the values in `freq` are summed
# into weekly bins within each `mun`, and the weekly total is stored as `cases`.
dengue <- read_parquet("../dengue-data/parquet_aggregated/dengue_md.parquet") %>%
  group_by(mun) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    cases = sum(freq, na.rm = TRUE)
  ) %>%
  ungroup()

Population

Estimated municipality population per year.

# Yearly municipality population, restricted to the calendar years spanned by
# the dengue series; `mun` is cast to character.
pop <- mun_pop_totals() %>%
  mutate(mun = as.character(mun)) %>%
  filter(year %in% (year(min(dengue$date)):year(max(dengue$date))))

Human Development Index

# Municipal HDI table: keep the municipality code and the `idhm2010` column,
# exposing the latter under the name `hdi2010`.
hdi <- read_parquet("../socioeconomic-data/hdi.parquet") %>%
  select(code_muni, idhm2010) %>%
  rename(hdi2010 = idhm2010)

Weather data

Weather indicators estimated by using zonal statistics of the territorial area of the municipality.

Precipitation

Total estimated precipitation, per municipality and week, in millimeters.

# Weekly precipitation: sum of the "pr_sum" values falling in each week, limited
# to the date range covered by the dengue series.
# NOTE(review): only `date` and `value` are kept, so no municipality identifier
# reaches the weekly aggregation and every row sharing a week is pooled into a
# single total. Confirm this is intended, since the accompanying text describes
# a per-municipality indicator.
prec <- open_dataset(sources = "../weather-data/parquet/brdwgd/pr.parquet") %>%
  filter(name == "pr_sum") %>%
  select(date, value) %>%
  collect() %>%
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    prec = sum(value, na.rm = TRUE)
  )

Average maximum temperature

Average of maximum temperatures, per municipality and week, in degrees Celsius.

# Weekly maximum temperature: mean of the "Tmax_mean" values falling in each
# week, limited to the date range covered by the dengue series.
# NOTE(review): only `date` and `value` are kept, so no municipality identifier
# reaches the weekly aggregation and every row sharing a week is averaged
# together. Confirm this is intended, since the accompanying text describes a
# per-municipality indicator.
tmax <- open_dataset(sources = "../weather-data/parquet/brdwgd/tmax.parquet") %>%
  filter(name == "Tmax_mean") %>%
  select(date, value) %>%
  collect() %>%
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    tmax = mean(value, na.rm = TRUE)
  )

Average minimum temperature

Average of minimum temperatures, per municipality and week, in degrees Celsius.

# Weekly minimum temperature: mean of the "Tmin_mean" values falling in each
# week, limited to the date range covered by the dengue series.
# NOTE(review): only `date` and `value` are kept, so no municipality identifier
# reaches the weekly aggregation and every row sharing a week is averaged
# together. Confirm this is intended, since the accompanying text describes a
# per-municipality indicator.
tmin <- open_dataset(sources = "../weather-data/parquet/brdwgd/tmin.parquet") %>%
  filter(name == "Tmin_mean") %>%
  select(date, value) %>%
  collect() %>%
  filter(date >= min(dengue$date), date <= max(dengue$date)) %>%
  summarise_by_time(
    .date_var = date,
    .by = "week",
    tmin = mean(value, na.rm = TRUE)
  )

Join data

# Assemble the export table. Dengue counts are matched to population on
# municipality and on the calendar year of the week date (via a temporary
# `dengue_year` column), then to HDI on municipality code.
# The three weather tables are matched on `date` only.
# Inner joins keep only rows that find a match in every source.
exp_data <- dengue %>%
  mutate(dengue_year = year(date)) %>%
  inner_join(pop, by = join_by(dengue_year == year, mun)) %>%
  select(!dengue_year) %>%
  inner_join(hdi, by = join_by(mun == code_muni)) %>%
  inner_join(prec, by = join_by(date)) %>%
  inner_join(tmax, by = join_by(date)) %>%
  inner_join(tmin, by = join_by(date))

Lag weather variables

Lag one and two weeks.

# Add one- and two-week lags of the weather variables (`*_lag1`, `*_lag2`).
# Lags are row-based, so they are computed within each municipality on rows
# ordered by date. Without the grouping, the first rows of a municipality would
# receive values from the last rows of the municipality stored just before it.
exp_data <- exp_data %>%
  group_by(mun) %>%
  arrange(date, .by_group = TRUE) %>%
  tk_augment_lags(.value = c(prec, tmax, tmin), .lags = 1:2) %>%
  ungroup()

Overview

# Compact column-by-column preview of the final table
glimpse(x = exp_data)
Rows: 2,202,058
Columns: 14
$ mun       <chr> "110001", "110001", "110001", "110001", "110001", "110001", …
$ date      <date> 2011-03-13, 2011-03-20, 2011-03-27, 2011-04-03, 2011-04-10,…
$ cases     <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pop       <int> 24737, 24737, 24737, 24737, 24737, 24737, 24737, 24737, 2473…
$ hdi2010   <dbl> 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.64…
$ prec      <dbl> 5977667, 5832714, 6581013, 6411878, 5538108, 4712170, 412114…
$ tmax      <dbl> 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29.32273, …
$ tmin      <dbl> 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19.37211, …
$ prec_lag1 <dbl> NA, 5977667, 5832714, 6581013, 6411878, 5538108, 4712170, 41…
$ tmax_lag1 <dbl> NA, 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29.322…
$ tmin_lag1 <dbl> NA, 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19.372…
$ prec_lag2 <dbl> NA, NA, 5977667, 5832714, 6581013, 6411878, 5538108, 4712170…
$ tmax_lag2 <dbl> NA, NA, 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29…
$ tmin_lag2 <dbl> NA, NA, 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19…

Municipalities count: 5167

Export data

# Write the dataset as Parquet and as CSV, compress the CSV into a zip archive,
# then delete the uncompressed CSV after the zip call.
write_parquet(exp_data, sink = "exp_data.parquet")
write_csv(exp_data, file = "exp_data.csv")
zip::zip(zipfile = "exp_data.csv.zip", files = "exp_data.csv")
unlink(x = "exp_data.csv")

Data dictionary

  • mun character. Municipality code with 6 digits

  • date date. Date in YYYY-MM-DD format, the ceiling date of the week

  • cases integer. Confirmed dengue cases count

  • pop integer. Estimated population of the municipality

  • hdi2010 double. Human Development Index for 2010

  • prec double. Total precipitation, mm

  • tmax double. Average maximum temperature, Celsius

  • tmin double. Average minimum temperature, Celsius

  • *_lag1 double. One week lagged variables

  • *_lag2 double. Two weeks lagged variables

Include on current GitHub release

# Release assets to publish
files_list <- c("exp_data.parquet", "exp_data.csv.zip")

# Push each asset to the repository release, one call per file, with
# overwrite enabled
walk(
  files_list,
  function(path) {
    pb_upload(file = path, repo = "rfsaldanha/dengue", overwrite = TRUE)
  }
)

The files exp_data.parquet and exp_data.csv.zip are available in the current release: https://github.com/rfsaldanha/dengue/releases

Session info

# Record the R version, platform and package versions used for this run
utils::sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=pt_BR.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] piggyback_0.1.4.9006 brpop_0.3.0          timetk_2.8.3        
 [4] arrow_12.0.1         lubridate_1.9.2      forcats_1.0.0       
 [7] stringr_1.5.0        dplyr_1.1.2          purrr_1.0.1         
[10] readr_2.1.4          tidyr_1.3.0          tibble_3.2.1        
[13] ggplot2_3.4.2        tidyverse_2.0.0     

loaded via a namespace (and not attached):
 [1] xts_0.13.1          bit64_4.0.5         httr_1.4.6         
 [4] DiceDesign_1.9      gh_1.4.0            tools_4.1.2        
 [7] utf8_1.2.3          R6_2.5.1            rpart_4.1.16       
[10] colorspace_2.1-0    yardstick_1.2.0     nnet_7.3-17        
[13] withr_2.5.0         tidyselect_1.2.0    curl_5.0.1         
[16] bit_4.0.5           compiler_4.1.2      httr2_0.2.3        
[19] cli_3.6.1           scales_1.2.1        tune_1.1.1         
[22] rappdirs_0.3.3      digest_0.6.31       rmarkdown_2.22     
[25] pkgconfig_2.0.3     htmltools_0.5.5     parallelly_1.36.0  
[28] lhs_1.1.6           fastmap_1.1.1       htmlwidgets_1.6.2  
[31] rlang_1.1.1         rstudioapi_0.14     generics_0.1.3     
[34] zoo_1.8-12          jsonlite_1.8.5      vroom_1.6.3        
[37] zip_2.3.0           magrittr_2.0.3      Matrix_1.5-4.1     
[40] Rcpp_1.0.10         munsell_0.5.0       fansi_1.0.4        
[43] GPfit_1.0-8         lifecycle_1.0.3     furrr_0.3.1        
[46] stringi_1.7.12      yaml_2.3.7          MASS_7.3-55        
[49] recipes_1.0.6       grid_4.1.2          parallel_4.1.2     
[52] listenv_0.9.0       crayon_1.5.2        lattice_0.20-45    
[55] splines_4.1.2       hms_1.1.3           knitr_1.43         
[58] pillar_1.9.0        dials_1.2.0         future.apply_1.11.0
[61] codetools_0.2-18    parsnip_1.1.0       glue_1.6.2         
[64] evaluate_0.21       rsample_1.1.1       data.table_1.14.8  
[67] vctrs_0.6.3         tzdb_0.4.0          foreach_1.5.2      
[70] gtable_0.3.3        future_1.32.0       assertthat_0.2.1   
[73] cachem_1.0.8        xfun_0.39           gower_1.0.1        
[76] mime_0.12           prodlim_2023.03.31  gitcreds_0.1.2     
[79] class_7.3-20        survival_3.2-13     timeDate_4022.108  
[82] iterators_1.0.14    memoise_2.0.1       hardhat_1.3.0      
[85] lava_1.7.2.1        workflows_1.1.3     timechange_0.2.0   
[88] globals_0.16.2      ipred_0.9-14       
Back to top