library(tidyverse)
library(lubridate)
library(arrow)
library(timetk)
library(dtwclust)
library(kableExtra)
library(tictoc)
source("../functions.R")
Scaled cases
This notebook aims to cluster the Brazilian municipalities considering scaled (standardized) dengue cases time-series similarities.
Packages
Load data
Load the aggregated data.
<- open_dataset(sources = data_dir("bundled_data/tdengue.parquet")) %>%
tdengue select(mun, date, cases) %>%
collect()
dim(tdengue)
[1] 340179 3
Prepare data
The chunk bellow executes various steps to prepare the data for clustering.
<- tdengue %>%
tdengue # Prepare time series
mutate(mun = paste0("m_", mun)) %>%
arrange(mun, date) %>%
pivot_wider(names_from = mun, values_from = cases) %>%
select(-date) %>%
t() %>%
tslist()
length(tdengue)
[1] 679
Clustering
Sequence of k
groups to be used.
<- 2:10 k_seq
DTW (basic)
tic()
<- tsclust(
clust_dtw series = tdengue,
type = "partitional",
k = k_seq,
distance = "dtw_basic",
seed = 13
)toc()
44.727 sec elapsed
names(clust_dtw) <- paste0("k_", k_seq)
<- sapply(clust_dtw, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.22470565 | 0 | 471.19772 | 2.169633 | 2.169633 | 0.09802846 | 0.4465088 |
k_6 | 0.16590632 | 0 | 129.12898 | 1.492001 | 1.779535 | 0.08842332 | 0.3564468 |
k_5 | 0.12371981 | 0 | 166.97238 | 1.873249 | 2.231609 | 0.08368662 | 0.3581035 |
k_3 | 0.12269031 | 0 | 282.64604 | 2.352913 | 2.754531 | 0.08676909 | 0.4086315 |
k_4 | 0.11888356 | 0 | 193.62938 | 1.898608 | 2.232433 | 0.08471016 | 0.3694141 |
k_10 | 0.10437071 | 0 | 82.16793 | 1.872270 | 2.348246 | 0.10197290 | 0.3198181 |
k_7 | 0.07891310 | 0 | 105.14468 | 2.366061 | 2.737811 | 0.10021368 | 0.3467224 |
k_9 | 0.05778652 | 0 | 87.10300 | 1.950026 | 2.716834 | 0.10021368 | 0.3233407 |
k_8 | 0.05494472 | 0 | 100.42993 | 1.902694 | 2.559555 | 0.07642521 | 0.3449551 |
<- clust_dtw[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
292 387
Soft-DTW
tic()
<- tsclust(
clust_sdtw series = tdengue,
type = "partitional",
k = k_seq,
distance = "sdtw",
seed = 13
)toc()
204.708 sec elapsed
names(clust_sdtw) <- paste0("k_", k_seq)
<- sapply(clust_sdtw, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.54420550 | 0 | 424.7128 | 1.784989 | 1.784989 | 0.0188267923 | 0.2699432 |
k_6 | 0.23058175 | 0 | 150.9456 | 1.252811 | 1.660076 | 0.0056554246 | 0.1492226 |
k_5 | 0.20100983 | 0 | 179.4794 | 2.019992 | 3.279759 | 0.0062697286 | 0.1565015 |
k_4 | 0.16735749 | 0 | 197.7269 | 2.204787 | 5.000366 | 0.0014544262 | 0.1900948 |
k_3 | 0.16612027 | 0 | 203.6722 | 4.148653 | 4.660288 | 0.0095083455 | 0.2259238 |
k_10 | 0.14175838 | 0 | 106.8324 | 1.978932 | 3.094434 | 0.0079666165 | 0.1331531 |
k_7 | 0.13487754 | 0 | 136.8319 | 1.860672 | 3.131776 | -0.0017931128 | 0.1454316 |
k_9 | 0.11776105 | 0 | 106.7446 | 2.427594 | 4.376014 | -0.0082114090 | 0.1384285 |
k_8 | 0.07201448 | 0 | 121.5954 | 2.392449 | 5.993679 | 0.0005698678 | 0.1471381 |
<- clust_sdtw[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
581 98
SBD
tic()
<- tsclust(
clust_sbd series = tdengue,
type = "partitional",
k = k_seq,
distance = "sbd",
seed = 13
)toc()
0.792 sec elapsed
names(clust_sbd) <- paste0("k_", k_seq)
<- sapply(clust_sbd, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.19153388 | 0.41954595 | 92.48199 | 4.965541 | 4.965541 | 0.05215726 | 0.3986087 |
k_6 | 0.17383693 | 0.15654880 | 45.49664 | 3.416220 | 3.694440 | 0.09627307 | 0.3700479 |
k_5 | 0.15090087 | 0.16086274 | 51.05856 | 2.100628 | 2.794277 | 0.08129365 | 0.3596084 |
k_10 | 0.09507187 | 0.03507591 | 37.49864 | 2.666277 | 3.750657 | 0.05350097 | 0.3320909 |
k_8 | 0.07435107 | 0.07700081 | 29.71794 | 4.390497 | 6.258006 | 0.04658001 | 0.3447000 |
k_9 | 0.05379396 | 0.05994618 | 39.01541 | 3.867380 | 5.360924 | 0.03345584 | 0.3376868 |
k_7 | 0.03260668 | 0.10548796 | 48.66182 | 4.096509 | 5.928294 | 0.02534005 | 0.3453233 |
k_3 | 0.02929865 | 0.29847866 | 58.62624 | 5.033205 | 6.498922 | 0.02115696 | 0.3885022 |
k_4 | 0.02545150 | 0.23112470 | 50.67983 | 5.198919 | 6.670280 | 0.02116725 | 0.3814808 |
<- clust_sbd[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
503 176
GAK
tic()
<- tsclust(
clust_gak series = tdengue,
type = "partitional",
k = k_seq,
distance = "gak",
seed = 13
)toc()
260.459 sec elapsed
names(clust_gak) <- paste0("k_", k_seq)
<- sapply(clust_gak, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_6 | 0.2830846 | 0.6304283 | 124.45464 | 2.229032 | 2.350710 | 0.009648476 | 0.2459049 |
k_9 | 0.2591976 | 0.6297937 | 102.39058 | 2.243505 | 3.344258 | 0.007771500 | 0.2020673 |
k_7 | 0.2559452 | 0.6301835 | 105.07795 | 2.610798 | 2.716722 | 0.009410867 | 0.2473205 |
k_10 | 0.2550581 | 0.6295251 | 95.02169 | 2.043610 | 2.248546 | 0.008625614 | 0.1991254 |
k_5 | 0.2253127 | 0.6305288 | 110.82437 | 1.737232 | 1.904585 | 0.008938342 | 0.3000306 |
k_3 | 0.1974647 | 0.6309581 | 98.32655 | 3.945344 | 4.077690 | 0.006170275 | 0.3651774 |
k_8 | 0.1823244 | 0.6298506 | 74.98460 | 2.864885 | 2.987567 | 0.010215175 | 0.2561144 |
k_2 | 0.1584990 | 0.6314929 | 203.86335 | 2.899706 | 2.899706 | 0.020554844 | 0.5280049 |
k_4 | 0.1423961 | 0.6306801 | 101.83124 | 3.970774 | 4.082930 | 0.005129007 | 0.3477263 |
<- clust_gak[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2 3 4 5 6
73 104 64 91 144 203
Session info
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
Random number generation:
RNG: L'Ecuyer-CMRG
Normal: Inversion
Sample: Rejection
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tictoc_1.2 kableExtra_1.3.4 dtwclust_5.5.12 dtw_1.23-1
[5] proxy_0.4-27 timetk_2.9.0 arrow_13.0.0.1 lubridate_1.9.3
[9] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3 purrr_1.0.2
[13] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
[17] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] rlang_1.1.2 magrittr_2.0.3 clue_0.3-65
[4] furrr_0.3.1 flexclust_1.4-1 compiler_4.3.2
[7] systemfonts_1.0.5 vctrs_0.6.4 reshape2_1.4.4
[10] rvest_1.0.3 lhs_1.1.6 tune_1.1.2
[13] pkgconfig_2.0.3 fastmap_1.1.1 ellipsis_0.3.2
[16] labeling_0.4.3 utf8_1.2.4 promises_1.2.1
[19] rmarkdown_2.25 prodlim_2023.08.28 tzdb_0.4.0
[22] bit_4.0.5 xfun_0.41 modeltools_0.2-23
[25] jsonlite_1.8.7 recipes_1.0.8 later_1.3.1
[28] parallel_4.3.2 cluster_2.1.4 R6_2.5.1
[31] stringi_1.7.12 rsample_1.2.0 parallelly_1.36.0
[34] rpart_4.1.21 Rcpp_1.0.11 assertthat_0.2.1
[37] dials_1.2.0 iterators_1.0.14 knitr_1.45
[40] future.apply_1.11.0 zoo_1.8-12 httpuv_1.6.12
[43] Matrix_1.6-1.1 splines_4.3.2 nnet_7.3-19
[46] timechange_0.2.0 tidyselect_1.2.0 rstudioapi_0.15.0
[49] yaml_2.3.7 timeDate_4022.108 codetools_0.2-19
[52] listenv_0.9.0 lattice_0.22-5 plyr_1.8.9
[55] shiny_1.7.5.1 withr_2.5.2 evaluate_0.23
[58] future_1.33.0 survival_3.5-7 RcppParallel_5.1.7
[61] xml2_1.3.5 xts_0.13.1 pillar_1.9.0
[64] foreach_1.5.2 stats4_4.3.2 shinyjs_2.1.0
[67] generics_0.1.3 hms_1.1.3 munsell_0.5.0
[70] scales_1.2.1 xtable_1.8-4 globals_0.16.2
[73] class_7.3-22 glue_1.6.2 tools_4.3.2
[76] data.table_1.14.8 RSpectra_0.16-1 webshot_0.5.5
[79] gower_1.0.1 grid_4.3.2 yardstick_1.2.0
[82] ipred_0.9-14 colorspace_2.1-0 cli_3.6.1
[85] DiceDesign_1.9 workflows_1.1.3 parsnip_1.1.1
[88] fansi_1.0.5 viridisLite_0.4.2 gt_0.10.0
[91] svglite_2.1.2 lava_1.7.3 gtable_0.3.4
[94] GPfit_1.0-8 sass_0.4.7 digest_0.6.33
[97] ggrepel_0.9.4 farver_2.1.1 htmlwidgets_1.6.2
[100] htmltools_0.5.7 lifecycle_1.0.4 httr_1.4.7
[103] hardhat_1.3.0 mime_0.12 bit64_4.0.5
[106] MASS_7.3-60