library(tidyverse)
library(lubridate)
library(arrow)
library(timetk)
library(dtwclust)
library(kableExtra)
library(tictoc)
source("../functions.R")
Cumulative time series
This notebook aims to cluster the Brazilian municipalities cumulative yearly dengue cases time-series yearly by its similarities.
Packages
Load data
Load the bundled data.
<- open_dataset(sources = data_dir("bundled_data/tdengue.parquet")) %>%
dengue select(mun, date, cases = cases_raw) %>%
collect()
dim(dengue)
[1] 340179 3
Prepare data
The chunk bellow executes various steps to prepare the data for clustering.
<- dengue %>%
tdengue # Remove dates
filter(date >= as.Date("2011-01-01") & date < as.Date("2020-01-01")) %>%
# Create year variable
mutate(year = year(date)) %>%
# Create week number
mutate(week = epiweek(date)) %>%
# Summarise per year and isoweek
group_by(mun, year, week) %>%
summarise(cases = sum(cases, na.rm = TRUE)) %>%
ungroup() %>%
# Accumulate mun time series by year
group_by(mun, year) %>%
arrange(week) %>%
mutate(cases = cumsum(cases)) %>%
# Scale cases
mutate(cases = scale(cases)) %>%
ungroup() %>%
# Isolate municipality and year
mutate(mun = paste0(mun, "_", year)) %>%
select(-year, week) %>%
# Arrange data
arrange(mun, week) %>%
# Prepare time series of municipalities by year
mutate(mun = paste0("m_", mun)) %>%
pivot_wider(names_from = mun, values_from = cases) %>%
select(-week) %>%
# Use zero value for years withot week 53
mutate(across(everything(), ~replace_na(.x, 0))) %>%
# Transpose as matrix
t() %>%
# Convert object
tslist()
`summarise()` has grouped output by 'mun', 'year'. You can override using the
`.groups` argument.
length(tdengue)
[1] 6111
Clustering
Sequence of k
groups to be used.
<- 3:10 k_seq
Soft-DTW method
tic()
<- tsclust(
clust series = tdengue,
type = "partitional",
k = k_seq,
distance = "sdtw",
seed = 13
)toc()
175.188 sec elapsed
Cluster Validity Indices (CVI)
names(clust) <- paste0("k_", k_seq)
<- sapply(clust, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi kbl() %>%
kable_styling()
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_4 | 0.4996424 | 0.0001998 | 5053.8781 | 0.813116 | 1.034655 | -0.0071101 | 0.0244607 |
k_3 | 0.4118566 | 0.0000000 | 859.9048 | 2.505068 | 3.417155 | -0.0070377 | 0.1095122 |
k_9 | 0.3971539 | 0.0000042 | 4229.2449 | 1.415377 | 5.666647 | -0.0069788 | 0.0123762 |
k_10 | 0.3930305 | 0.0000002 | 4035.8058 | 1.212292 | 6.342339 | -0.0077680 | 0.0109508 |
k_7 | 0.3599889 | 0.0000153 | 4452.3025 | 1.448889 | 5.467205 | -0.0075078 | 0.0151751 |
k_5 | 0.3430675 | 0.0000000 | 549.0199 | 3.757548 | 11.951407 | -0.0075078 | 0.0968742 |
k_8 | 0.3184463 | 0.0000000 | 3924.2207 | 2.307808 | 9.657683 | -0.0075078 | 0.0145123 |
k_6 | 0.3170633 | 0.0000108 | 4379.2283 | 3.268082 | 9.255333 | -0.0074597 | 0.0175612 |
Cluster with higher Silhouette statistic
<- clust[[res_cvi[[1,1]]]] sel_clust
plot(sel_clust)
ggsave(filename = "cluster_mun_year_cum.pdf")
Saving 7 x 5 in image
plot(sel_clust, type = "centroids", lty = 1)
ggsave(filename = "cluster_mun_year_cum_centr.pdf")
Saving 7 x 5 in image
Cluster sizes
table(sel_clust@cluster)
1 2 3 4
1280 989 3341 501
Session info
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
Random number generation:
RNG: L'Ecuyer-CMRG
Normal: Inversion
Sample: Rejection
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tictoc_1.2 kableExtra_1.3.4 dtwclust_5.5.12 dtw_1.23-1
[5] proxy_0.4-27 timetk_2.9.0 arrow_13.0.0.1 lubridate_1.9.3
[9] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3 purrr_1.0.2
[13] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
[17] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] rlang_1.1.2 magrittr_2.0.3 clue_0.3-65
[4] furrr_0.3.1 flexclust_1.4-1 compiler_4.3.2
[7] systemfonts_1.0.5 vctrs_0.6.4 reshape2_1.4.4
[10] rvest_1.0.3 lhs_1.1.6 tune_1.1.2
[13] pkgconfig_2.0.3 fastmap_1.1.1 ellipsis_0.3.2
[16] labeling_0.4.3 utf8_1.2.4 promises_1.2.1
[19] rmarkdown_2.25 prodlim_2023.08.28 tzdb_0.4.0
[22] ragg_1.2.6 bit_4.0.5 xfun_0.41
[25] modeltools_0.2-23 jsonlite_1.8.7 recipes_1.0.8
[28] highr_0.10 later_1.3.1 parallel_4.3.2
[31] cluster_2.1.4 R6_2.5.1 stringi_1.7.12
[34] rsample_1.2.0 parallelly_1.36.0 rpart_4.1.21
[37] Rcpp_1.0.11 assertthat_0.2.1 dials_1.2.0
[40] iterators_1.0.14 knitr_1.45 future.apply_1.11.0
[43] zoo_1.8-12 httpuv_1.6.12 Matrix_1.6-1.1
[46] splines_4.3.2 nnet_7.3-19 timechange_0.2.0
[49] tidyselect_1.2.0 rstudioapi_0.15.0 yaml_2.3.7
[52] timeDate_4022.108 codetools_0.2-19 listenv_0.9.0
[55] lattice_0.22-5 plyr_1.8.9 shiny_1.7.5.1
[58] withr_2.5.2 evaluate_0.23 future_1.33.0
[61] survival_3.5-7 RcppParallel_5.1.7 xml2_1.3.5
[64] xts_0.13.1 pillar_1.9.0 foreach_1.5.2
[67] stats4_4.3.2 shinyjs_2.1.0 generics_0.1.3
[70] hms_1.1.3 munsell_0.5.0 scales_1.2.1
[73] xtable_1.8-4 globals_0.16.2 class_7.3-22
[76] glue_1.6.2 tools_4.3.2 data.table_1.14.8
[79] RSpectra_0.16-1 webshot_0.5.5 gower_1.0.1
[82] grid_4.3.2 yardstick_1.2.0 ipred_0.9-14
[85] colorspace_2.1-0 cli_3.6.1 DiceDesign_1.9
[88] textshaping_0.3.7 workflows_1.1.3 parsnip_1.1.1
[91] fansi_1.0.5 viridisLite_0.4.2 svglite_2.1.2
[94] lava_1.7.3 gtable_0.3.4 GPfit_1.0-8
[97] digest_0.6.33 ggrepel_0.9.4 farver_2.1.1
[100] htmlwidgets_1.6.2 htmltools_0.5.7 lifecycle_1.0.4
[103] httr_1.4.7 hardhat_1.3.0 mime_0.12
[106] bit64_4.0.5 MASS_7.3-60