library(tidyverse)
library(tidymodels)
library(arrow)
library(timetk)
library(rpart.plot)
library(vip)
Classification task
This notebook models the relationship between dengue cases and weather variables using a classification of dengue cases as outbreak level (anomaly) or base level.
Packages
Dengue data
Subset and aggregate
Rio de Janeiro, RJ, aggregated by month.
<- open_dataset("../dengue-data/parquet_aggregated/dengue_md.parquet") %>%
dengue_rj filter(mun == 330455) %>%
collect() %>%
summarise_by_time(.date_var = date, .by = "month", freq = sum(freq, na.rm = TRUE))
plot_time_series(.data = dengue_rj, .date_var = date, .value = freq, .smooth = FALSE, .title = "Dengue, absolute number of cases")
Classify
plot_anomaly_diagnostics(.data = dengue_rj, .date_var = date, .value = freq, .alpha = 0.10, .max_anomalies = 1, .legend_show = FALSE)
frequency = 12 observations per 1 year
trend = 60 observations per 5 years
<- tk_anomaly_diagnostics(.data = dengue_rj, .date_var = date, .value = freq, .alpha = 0.10, .max_anomalies = 1) %>%
dengue_rj_anom select(date, anomaly) %>%
mutate(anomaly = as.factor(anomaly))
frequency = 12 observations per 1 year
trend = 60 observations per 5 years
<- inner_join(dengue_rj, dengue_rj_anom) dengue_rj
Joining with `by = join_by(date)`
prop.table(table(dengue_rj$anomaly))
No Yes
0.8257576 0.1742424
Weather data
<- open_dataset(sources = "../weather-data/parquet/brdwgd/tmax.parquet") %>%
tmax filter(code_muni == 3304557) %>%
filter(name == "Tmax_mean") %>%
select(date, value) %>%
collect() %>%
filter(date >= min(dengue_rj$date) & date <= max(dengue_rj$date)) %>%
summarise_by_time(.date_var = date, .by = "month", value = mean(value, na.rm = TRUE)) %>%
rename(tmax = value)
<- open_dataset(sources = "../weather-data/parquet/brdwgd/pr.parquet") %>%
prec filter(code_muni == 3304557) %>%
filter(name == "pr_sum") %>%
select(date, value) %>%
collect() %>%
filter(date >= min(dengue_rj$date) & date <= max(dengue_rj$date)) %>%
summarise_by_time(.date_var = date, .by = "month", value = sum(value, na.rm = TRUE)) %>%
rename(prec = value)
<- open_dataset(sources = "../weather-data/parquet/brdwgd/pr.parquet") %>%
prec_avg filter(code_muni == 3304557) %>%
filter(name == "pr_mean") %>%
select(date, value) %>%
collect() %>%
filter(date >= min(dengue_rj$date) & date <= max(dengue_rj$date)) %>%
summarise_by_time(.date_var = date, .by = "month", value = mean(value, na.rm = TRUE)) %>%
rename(prec_avg = value)
<- open_dataset(sources = "../weather-data/parquet/brdwgd/rh.parquet") %>%
rh filter(code_muni == 3304557) %>%
filter(name == "RH_mean") %>%
select(date, value) %>%
collect() %>%
filter(date >= min(dengue_rj$date) & date <= max(dengue_rj$date)) %>%
summarise_by_time(.date_var = date, .by = "month", value = mean(value, na.rm = TRUE)) %>%
rename(rh = value)
<- open_dataset(sources = "../weather-data/parquet/brdwgd/u2.parquet") %>%
wind filter(code_muni == 3304557) %>%
filter(name == "u2_mean") %>%
select(date, value) %>%
collect() %>%
filter(date >= min(dengue_rj$date) & date <= max(dengue_rj$date)) %>%
summarise_by_time(.date_var = date, .by = "month", value = mean(value, na.rm = TRUE)) %>%
rename(wind = value)
plot_time_series(.data = tmax, .date_var = date, .value = tmax, .smooth = FALSE, .title = "Max temp, average")
plot_time_series(.data = prec, .date_var = date, .value = prec, .smooth = FALSE, .title = "Precipitation, sum")
plot_time_series(.data = prec_avg, .date_var = date, .value = prec_avg, .smooth = FALSE, .title = "Precipitation, average")
plot_time_series(.data = rh, .date_var = date, .value = rh, .smooth = FALSE, .title = "Relative humidity, average")
plot_time_series(.data = wind, .date_var = date, .value = wind, .smooth = FALSE, .title = "Wind, average")
Join data
<- inner_join(x = dengue_rj, y = tmax, by = "date") %>%
res inner_join(prec, by = "date") %>%
inner_join(prec_avg, by = "date") %>%
inner_join(rh, by = "date") %>%
inner_join(wind, by = "date") %>%
select(date, anomaly, tmax, prec, rh, wind)
Decision tree
Prepare data
Remove date
Lag variables: 12 months
<- res %>%
res_prep select(-date) %>%
tk_augment_lags(.value = c(tmax, prec, rh, wind), .lags = 1:12)
Parameters
<- decision_tree() %>%
tree_spec set_engine("rpart") %>%
set_mode("classification")
Fit model
<- tree_spec %>%
fit1 fit(anomaly ~ ., data = res_prep, model = TRUE)
%>% extract_fit_engine() %>%
fit1 rpart.plot(roundint = FALSE)
augment(fit1, new_data = res_prep) %>%
conf_mat(truth = anomaly, estimate = .pred_class)
Truth
Prediction No Yes
No 86 7
Yes 6 16
augment(fit1, new_data = res_prep) %>%
accuracy(truth = anomaly, estimate = .pred_class)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.887
%>%
fit1 extract_fit_engine() %>%
vip()
Session info
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] vip_0.3.2 rpart.plot_3.1.1 rpart_4.1.16 timetk_2.8.3
[5] arrow_12.0.1 yardstick_1.2.0 workflowsets_1.0.1 workflows_1.1.3
[9] tune_1.1.1 rsample_1.1.1 recipes_1.0.6 parsnip_1.1.0
[13] modeldata_1.1.0 infer_1.0.4 dials_1.2.0 scales_1.2.1
[17] broom_1.0.5 tidymodels_1.1.0 lubridate_1.9.2 forcats_1.0.0
[21] stringr_1.5.0 dplyr_1.1.2 purrr_1.0.1 readr_2.1.4
[25] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] xts_0.13.1 bit64_4.0.5 httr_1.4.6
[4] DiceDesign_1.9 tools_4.1.2 backports_1.4.1
[7] utf8_1.2.3 R6_2.5.1 lazyeval_0.2.2
[10] colorspace_2.1-0 nnet_7.3-17 withr_2.5.0
[13] gridExtra_2.3 tidyselect_1.2.0 bit_4.0.5
[16] compiler_4.1.2 cli_3.6.1 plotly_4.10.2
[19] labeling_0.4.2 digest_0.6.32 rmarkdown_2.23
[22] pkgconfig_2.0.3 htmltools_0.5.5 parallelly_1.36.0
[25] lhs_1.1.6 fastmap_1.1.1 htmlwidgets_1.6.2
[28] rlang_1.1.1 rstudioapi_0.14 farver_2.1.1
[31] generics_0.1.3 zoo_1.8-12 jsonlite_1.8.7
[34] crosstalk_1.2.0 magrittr_2.0.3 Matrix_1.5-4.1
[37] Rcpp_1.0.10 munsell_0.5.0 fansi_1.0.4
[40] GPfit_1.0-8 lifecycle_1.0.3 furrr_0.3.1
[43] stringi_1.7.12 yaml_2.3.7 MASS_7.3-55
[46] grid_4.1.2 parallel_4.1.2 listenv_0.9.0
[49] lattice_0.20-45 splines_4.1.2 hms_1.1.3
[52] knitr_1.43 pillar_1.9.0 future.apply_1.11.0
[55] codetools_0.2-18 glue_1.6.2 evaluate_0.21
[58] data.table_1.14.8 vctrs_0.6.3 tzdb_0.4.0
[61] foreach_1.5.2 gtable_0.3.3 future_1.33.0
[64] assertthat_0.2.1 xfun_0.39 gower_1.0.1
[67] prodlim_2023.03.31 viridisLite_0.4.2 class_7.3-20
[70] survival_3.2-13 timeDate_4022.108 iterators_1.0.14
[73] hardhat_1.3.0 lava_1.7.2.1 timechange_0.2.0
[76] globals_0.16.2 ellipsis_0.3.2 ipred_0.9-14