library(tidyverse)
library(lubridate)
library(arrow)
library(timetk)
library(brpop)
library(piggyback)
Exported data
Packages
Dengue data
Total number of confirmed dengue cases, aggregated per municipality of residence and week of the first symptom’s onset.
# Weekly dengue case counts per municipality.
dengue <- read_parquet("../dengue-data/parquet_aggregated/dengue_md.parquet") %>%
  group_by(mun) %>%
  # Within each municipality, collapse the records into one row per week,
  # summing the counts and ignoring missing values.
  summarise_by_time(.date_var = date, .by = "week", freq = sum(freq, na.rm = TRUE)) %>%
  ungroup() %>%
  rename(cases = freq)
Population
Estimated municipality population per year.
# Municipality population per year, restricted to the years spanned by the
# dates in `dengue`.
pop <- mun_pop_totals() %>%
  filter(year %in% seq(year(min(dengue$date)), year(max(dengue$date)))) %>%
  # Store the municipality code as character.
  mutate(mun = as.character(mun))
Human Development Index
# Keep only the municipality code and the `idhm2010` column, renamed to
# `hdi2010`.
hdi <- read_parquet("../socioeconomic-data/hdi.parquet") %>%
  select(code_muni, hdi2010 = idhm2010)
Weather data
Weather indicators estimated by using zonal statistics of the territorial area of the municipality.
Precipitation
Total estimated precipitation, per municipality and week, in millimeters.
# Weekly sum of the "pr_sum" indicator over the date range covered by `dengue`.
# NOTE(review): only `date` and `value` are kept, so any municipality
# identifier in the source is dropped and the weekly sum is a single value per
# week, not one per municipality -- confirm this is intended.
prec <- open_dataset(sources = "../weather-data/parquet/brdwgd/pr.parquet") %>%
  filter(name == "pr_sum") %>%
  select(date, value) %>%
  # Materialise the filtered rows in memory before the remaining steps.
  collect() %>%
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(.date_var = date, .by = "week", value = sum(value, na.rm = TRUE)) %>%
  rename(prec = value)
Average maximum temperature
Average of maximum temperatures, per municipality and week, in degrees Celsius.
# Weekly mean of the "Tmax_mean" indicator over the date range covered by
# `dengue`.
# NOTE(review): only `date` and `value` are kept, so any municipality
# identifier in the source is dropped and the weekly mean is a single value per
# week, not one per municipality -- confirm this is intended.
tmax <- open_dataset(sources = "../weather-data/parquet/brdwgd/tmax.parquet") %>%
  filter(name == "Tmax_mean") %>%
  select(date, value) %>%
  # Materialise the filtered rows in memory before the remaining steps.
  collect() %>%
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(.date_var = date, .by = "week", value = mean(value, na.rm = TRUE)) %>%
  rename(tmax = value)
Average minimum temperature
Average of minimum temperatures, per municipality and week, in degrees Celsius.
# Weekly mean of the "Tmin_mean" indicator over the date range covered by
# `dengue`.
# NOTE(review): only `date` and `value` are kept, so any municipality
# identifier in the source is dropped and the weekly mean is a single value per
# week, not one per municipality -- confirm this is intended.
tmin <- open_dataset(sources = "../weather-data/parquet/brdwgd/tmin.parquet") %>%
  filter(name == "Tmin_mean") %>%
  select(date, value) %>%
  # Materialise the filtered rows in memory before the remaining steps.
  collect() %>%
  filter(date >= min(dengue$date) & date <= max(dengue$date)) %>%
  summarise_by_time(.date_var = date, .by = "week", value = mean(value, na.rm = TRUE)) %>%
  rename(tmin = value)
Join data
# Assemble the export table: dengue cases enriched with population, HDI and
# weather indicators. Inner joins drop rows without a match in any source.
exp_data <- dengue %>%
  # Temporary year key used to match the yearly population table.
  mutate(dengue_year = year(date)) %>%
  inner_join(pop, by = c("dengue_year" = "year", "mun")) %>%
  select(-dengue_year) %>%
  inner_join(hdi, by = c("mun" = "code_muni")) %>%
  # Weather tables are matched on the week date only.
  inner_join(prec, by = "date") %>%
  inner_join(tmax, by = "date") %>%
  inner_join(tmin, by = "date")
Lag weather variables
Lag one and two weeks.
# Add one- and two-week lags of the weather variables.
# The table stacks one weekly series per municipality, so the lags are computed
# within each municipality: without grouping, the first rows of a municipality
# would receive values from the last rows of the previous one.
exp_data <- exp_data %>%
  # Lagging is positional, so make the within-municipality date order explicit.
  arrange(mun, date) %>%
  group_by(mun) %>%
  tk_augment_lags(.value = c(prec, tmax, tmin), .lags = 1:2) %>%
  ungroup()
Overview
exp_data %>% glimpse()
Rows: 2,202,058
Columns: 14
$ mun <chr> "110001", "110001", "110001", "110001", "110001", "110001", …
$ date <date> 2011-03-13, 2011-03-20, 2011-03-27, 2011-04-03, 2011-04-10,…
$ cases <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ pop <int> 24737, 24737, 24737, 24737, 24737, 24737, 24737, 24737, 2473…
$ hdi2010 <dbl> 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.641, 0.64…
$ prec <dbl> 5977667, 5832714, 6581013, 6411878, 5538108, 4712170, 412114…
$ tmax <dbl> 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29.32273, …
$ tmin <dbl> 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19.37211, …
$ prec_lag1 <dbl> NA, 5977667, 5832714, 6581013, 6411878, 5538108, 4712170, 41…
$ tmax_lag1 <dbl> NA, 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29.322…
$ tmin_lag1 <dbl> NA, 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19.372…
$ prec_lag2 <dbl> NA, NA, 5977667, 5832714, 6581013, 6411878, 5538108, 4712170…
$ tmax_lag2 <dbl> NA, NA, 29.89511, 29.59591, 29.72916, 29.27960, 29.05301, 29…
$ tmin_lag2 <dbl> NA, NA, 20.19973, 20.17791, 20.34251, 19.15049, 19.11662, 19…
Municipalities count: 5167
Export data
# Write the dataset in Parquet and CSV formats.
exp_data %>% write_parquet(sink = "exp_data.parquet")
exp_data %>% write_csv(file = "exp_data.csv")
# Compress the CSV, then delete the uncompressed copy.
zip::zip(zipfile = "exp_data.csv.zip", files = "exp_data.csv")
unlink(x = "exp_data.csv")
Data dictionary
mun
character. Municipality code with 6 digits.
date
date. Ceiling date of the week, in YYYY-MM-DD format.
cases
integer. Confirmed dengue cases count.
pop
integer. Population estimate of the municipality.
hdi2010
double. Human Development Index for 2010.
prec
double. Total precipitation, mm.
tmax
double. Average maximum temperature, Celsius.
tmin
double. Average minimum temperature, Celsius.
*_lag1
double. One-week lagged variables.
*_lag2
double. Two-week lagged variables.
Include on current GitHub release
# Files list to upload
files_list <- c("exp_data.parquet", "exp_data.csv.zip")

# Upload files one at a time, replacing any existing file of the same name
for (i in files_list) {
  pb_upload(file = i, repo = "rfsaldanha/dengue", overwrite = TRUE)
}
The files exp_data.parquet and exp_data.csv.zip are available in the current release: https://github.com/rfsaldanha/dengue/releases
Session info
utils::sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] piggyback_0.1.4.9006 brpop_0.3.0 timetk_2.8.3
[4] arrow_12.0.1 lubridate_1.9.2 forcats_1.0.0
[7] stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1
[10] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
[13] ggplot2_3.4.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] xts_0.13.1 bit64_4.0.5 httr_1.4.6
[4] DiceDesign_1.9 gh_1.4.0 tools_4.1.2
[7] utf8_1.2.3 R6_2.5.1 rpart_4.1.16
[10] colorspace_2.1-0 yardstick_1.2.0 nnet_7.3-17
[13] withr_2.5.0 tidyselect_1.2.0 curl_5.0.1
[16] bit_4.0.5 compiler_4.1.2 httr2_0.2.3
[19] cli_3.6.1 scales_1.2.1 tune_1.1.1
[22] rappdirs_0.3.3 digest_0.6.31 rmarkdown_2.22
[25] pkgconfig_2.0.3 htmltools_0.5.5 parallelly_1.36.0
[28] lhs_1.1.6 fastmap_1.1.1 htmlwidgets_1.6.2
[31] rlang_1.1.1 rstudioapi_0.14 generics_0.1.3
[34] zoo_1.8-12 jsonlite_1.8.5 vroom_1.6.3
[37] zip_2.3.0 magrittr_2.0.3 Matrix_1.5-4.1
[40] Rcpp_1.0.10 munsell_0.5.0 fansi_1.0.4
[43] GPfit_1.0-8 lifecycle_1.0.3 furrr_0.3.1
[46] stringi_1.7.12 yaml_2.3.7 MASS_7.3-55
[49] recipes_1.0.6 grid_4.1.2 parallel_4.1.2
[52] listenv_0.9.0 crayon_1.5.2 lattice_0.20-45
[55] splines_4.1.2 hms_1.1.3 knitr_1.43
[58] pillar_1.9.0 dials_1.2.0 future.apply_1.11.0
[61] codetools_0.2-18 parsnip_1.1.0 glue_1.6.2
[64] evaluate_0.21 rsample_1.1.1 data.table_1.14.8
[67] vctrs_0.6.3 tzdb_0.4.0 foreach_1.5.2
[70] gtable_0.3.3 future_1.32.0 assertthat_0.2.1
[73] cachem_1.0.8 xfun_0.39 gower_1.0.1
[76] mime_0.12 prodlim_2023.03.31 gitcreds_0.1.2
[79] class_7.3-20 survival_3.2-13 timeDate_4022.108
[82] iterators_1.0.14 memoise_2.0.1 hardhat_1.3.0
[85] lava_1.7.2.1 workflows_1.1.3 timechange_0.2.0
[88] globals_0.16.2 ipred_0.9-14