library(tidyverse)
library(lubridate)
library(arrow)
library(timetk)
library(dtwclust)
library(kableExtra)
library(tictoc)
source("../functions.R")
Scaled cases, cumulative
This notebook aims to cluster the Brazilian municipalities considering scaled (standardized) dengue cases time-series similarities.
Packages
Load data
Load the aggregated data.
<- open_dataset(sources = data_dir("bundled_data/tdengue.parquet")) %>%
tdengue select(mun, date, cases = cases_cum) %>%
collect()
dim(tdengue)
[1] 340179 3
Prepare data
The chunk bellow executes various steps to prepare the data for clustering.
<- tdengue %>%
tdengue # Prepare time series
mutate(mun = paste0("m_", mun)) %>%
arrange(mun, date) %>%
pivot_wider(names_from = mun, values_from = cases) %>%
select(-date) %>%
t() %>%
tslist()
saveRDS(object = tdengue, file = "tdengue.rds")
length(tdengue)
[1] 679
Clustering
Sequence of k
groups to be used.
<- 2:10 k_seq
DTW (basic)
tic()
<- tsclust(
clust_dtw series = tdengue,
type = "partitional",
k = k_seq,
distance = "dtw_basic",
seed = 13
)toc()
39.132 sec elapsed
names(clust_dtw) <- paste0("k_", k_seq)
<- sapply(clust_dtw, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.3883485 | 0 | 219.03946 | 1.134304 | 1.134304 | 0.008077949 | 0.15430130 |
k_3 | 0.3067752 | 0 | 310.90157 | 1.635604 | 1.775981 | 0.006456940 | 0.12341630 |
k_5 | 0.2387272 | 0 | 162.71153 | 1.759704 | 2.280500 | 0.006971102 | 0.09710109 |
k_8 | 0.2052530 | 0 | 116.02794 | 1.497911 | 2.157663 | 0.010105561 | 0.08426017 |
k_9 | 0.1759150 | 0 | 110.48283 | 1.982305 | 3.102040 | 0.007701467 | 0.07432019 |
k_7 | 0.1677298 | 0 | 116.57366 | 1.604325 | 2.207429 | 0.010855206 | 0.08933756 |
k_10 | 0.1586523 | 0 | 80.28211 | 2.273564 | 3.023525 | 0.006111798 | 0.08597771 |
k_4 | 0.1539778 | 0 | 147.65699 | 2.473204 | 3.436038 | 0.005005953 | 0.12329358 |
k_6 | 0.1527731 | 0 | 126.08954 | 2.013522 | 2.576196 | 0.005165569 | 0.10876459 |
<- clust_dtw[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
551 128
Soft-DTW
tic()
<- tsclust(
clust_sdtw series = tdengue,
type = "partitional",
k = k_seq,
distance = "sdtw",
seed = 13
)toc()
193.822 sec elapsed
names(clust_sdtw) <- paste0("k_", k_seq)
<- sapply(clust_sdtw, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_9 | 4.6835404 | 3.301961e-04 | -1198.16808 | 0.4328621 | 70.9687638 | -0.027622202 | -0.001783289 |
k_3 | 0.5874870 | 1.210092e-08 | 241.76435 | 0.4728893 | 0.6095286 | -0.016428914 | 0.017947639 |
k_2 | 0.5526841 | 1.864919e-08 | -157.74645 | 0.6603416 | 0.6603416 | -0.008343487 | 0.076892143 |
k_7 | 0.5143244 | 2.926145e-06 | 1776.33826 | 3.5865645 | -11.8749979 | -0.024224946 | 0.003868853 |
k_6 | 0.5091103 | 4.869410e-04 | 463.73188 | 2.1072188 | 0.3291628 | -0.018669198 | 0.005947036 |
k_10 | 0.4865815 | 8.516047e-01 | 1744.16238 | 2.2000345 | -13.7430978 | -0.020388668 | 0.003773382 |
k_5 | 0.4759798 | 3.530744e-08 | 219.42676 | 1.0472748 | 2.8079291 | -0.015917823 | 0.007728450 |
k_4 | 0.2085165 | 1.043811e-05 | 46.69006 | 1.5530459 | -0.6703440 | -0.016278775 | 0.027708772 |
k_8 | 0.0811271 | 1.595867e-05 | 51191.47920 | 3.6371109 | -5.0907762 | -0.024336278 | 0.001610495 |
<- clust_sdtw[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2 3 4 5 6 7 8 9
74 106 102 83 89 78 44 81 22
<- tibble(
cluster_ids mun = names(tdengue) %>% substr(3, 9),
group = as.character(sel_clust@cluster)
)
saveRDS(object = cluster_ids, file = "clust_sdtw_ids.rds")
SBD
tic()
<- tsclust(
clust_sbd series = tdengue,
type = "partitional",
k = k_seq,
distance = "sbd",
seed = 13
)toc()
0.756 sec elapsed
names(clust_sbd) <- paste0("k_", k_seq)
<- sapply(clust_sbd, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.4442625 | 0.5940473 | 345.3780 | 1.035863 | 1.035863 | 0.008004787 | 0.20201102 |
k_3 | 0.4022197 | 0.5792206 | 346.5557 | 1.239092 | 1.387231 | 0.001858214 | 0.12732693 |
k_4 | 0.3334983 | 0.5737804 | 245.6445 | 1.121578 | 1.815766 | 0.002032385 | 0.10409579 |
k_7 | 0.3274548 | 0.5339155 | 190.7100 | 1.212410 | 2.079583 | 0.004857007 | 0.06826826 |
k_6 | 0.3125595 | 0.5426452 | 206.1180 | 1.239948 | 2.105174 | 0.008653764 | 0.07328745 |
k_10 | 0.2993463 | 0.5225233 | 168.3385 | 1.828146 | 3.486689 | 0.002307383 | 0.05626920 |
k_9 | 0.2813463 | 0.5244313 | 182.8368 | 1.213397 | 1.947885 | 0.002827627 | 0.05685390 |
k_5 | 0.2604179 | 0.5603160 | 209.0601 | 1.290158 | 1.825457 | 0.003340171 | 0.10155246 |
k_8 | 0.1947688 | 0.5344965 | 148.8106 | 1.642704 | 3.682444 | 0.002371686 | 0.07799687 |
<- clust_sbd[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
500 179
GAK
tic()
<- tsclust(
clust_gak series = tdengue,
type = "partitional",
k = k_seq,
distance = "gak",
seed = 13
)toc()
280.382 sec elapsed
names(clust_gak) <- paste0("k_", k_seq)
<- sapply(clust_gak, cvi, type = "internal") %>%
res_cvi t() %>%
as_tibble(rownames = "k") %>%
arrange(-Sil)
%>%
res_cvi ::gt() gt
k | Sil | SF | CH | DB | DBstar | D | COP |
---|---|---|---|---|---|---|---|
k_2 | 0.4703437 | 0.6318950 | 346.9030 | 0.7492626 | 0.7492626 | 0.002673344 | 0.22610650 |
k_3 | 0.4242503 | 0.6317923 | 350.9145 | 1.1578518 | 1.3150938 | 0.001689423 | 0.12032277 |
k_4 | 0.3404306 | 0.6317602 | 248.3140 | 1.0644928 | 1.7873944 | 0.001860028 | 0.09770063 |
k_7 | 0.3391378 | 0.6315128 | 204.6923 | 1.2022774 | 2.0564247 | 0.003583292 | 0.06365358 |
k_6 | 0.3320730 | 0.6315865 | 228.3494 | 1.1336897 | 1.8636779 | 0.004311562 | 0.06847479 |
k_9 | 0.3173288 | 0.6314596 | 178.8928 | 1.1311449 | 1.9720656 | 0.004519858 | 0.04981730 |
k_10 | 0.3094439 | 0.6314500 | 177.1258 | 1.7581411 | 3.6257840 | 0.001690421 | 0.05150554 |
k_5 | 0.2528416 | 0.6316542 | 190.1814 | 1.5612030 | 2.0839318 | 0.001970852 | 0.09056517 |
k_8 | 0.2169763 | 0.6315151 | 152.0770 | 1.7145901 | 4.1675547 | 0.001897451 | 0.07208410 |
<- clust_gak[[res_cvi[[1,1]]]]
sel_clust
plot(sel_clust)
plot(sel_clust, type = "centroids", lty = 1)
table(sel_clust@cluster)
1 2
535 144
Session info
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
Random number generation:
RNG: L'Ecuyer-CMRG
Normal: Inversion
Sample: Rejection
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] tictoc_1.2 kableExtra_1.3.4 dtwclust_5.5.12 dtw_1.23-1
[5] proxy_0.4-27 timetk_2.9.0 arrow_13.0.0.1 lubridate_1.9.3
[9] forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3 purrr_1.0.2
[13] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
[17] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] rlang_1.1.2 magrittr_2.0.3 clue_0.3-65
[4] furrr_0.3.1 flexclust_1.4-1 compiler_4.3.2
[7] systemfonts_1.0.5 vctrs_0.6.4 reshape2_1.4.4
[10] rvest_1.0.3 lhs_1.1.6 tune_1.1.2
[13] pkgconfig_2.0.3 fastmap_1.1.1 ellipsis_0.3.2
[16] labeling_0.4.3 utf8_1.2.4 promises_1.2.1
[19] rmarkdown_2.25 prodlim_2023.08.28 tzdb_0.4.0
[22] bit_4.0.5 xfun_0.41 modeltools_0.2-23
[25] jsonlite_1.8.7 recipes_1.0.8 later_1.3.1
[28] parallel_4.3.2 cluster_2.1.4 R6_2.5.1
[31] stringi_1.7.12 rsample_1.2.0 parallelly_1.36.0
[34] rpart_4.1.21 Rcpp_1.0.11 assertthat_0.2.1
[37] dials_1.2.0 iterators_1.0.14 knitr_1.45
[40] future.apply_1.11.0 zoo_1.8-12 httpuv_1.6.12
[43] Matrix_1.6-1.1 splines_4.3.2 nnet_7.3-19
[46] timechange_0.2.0 tidyselect_1.2.0 rstudioapi_0.15.0
[49] yaml_2.3.7 timeDate_4022.108 codetools_0.2-19
[52] listenv_0.9.0 lattice_0.22-5 plyr_1.8.9
[55] shiny_1.7.5.1 withr_2.5.2 evaluate_0.23
[58] future_1.33.0 survival_3.5-7 RcppParallel_5.1.7
[61] xml2_1.3.5 xts_0.13.1 pillar_1.9.0
[64] foreach_1.5.2 stats4_4.3.2 shinyjs_2.1.0
[67] generics_0.1.3 hms_1.1.3 munsell_0.5.0
[70] scales_1.2.1 xtable_1.8-4 globals_0.16.2
[73] class_7.3-22 glue_1.6.2 tools_4.3.2
[76] data.table_1.14.8 RSpectra_0.16-1 webshot_0.5.5
[79] gower_1.0.1 grid_4.3.2 yardstick_1.2.0
[82] ipred_0.9-14 colorspace_2.1-0 cli_3.6.1
[85] DiceDesign_1.9 workflows_1.1.3 parsnip_1.1.1
[88] fansi_1.0.5 viridisLite_0.4.2 gt_0.10.0
[91] svglite_2.1.2 lava_1.7.3 gtable_0.3.4
[94] GPfit_1.0-8 sass_0.4.7 digest_0.6.33
[97] ggrepel_0.9.4 farver_2.1.1 htmlwidgets_1.6.2
[100] htmltools_0.5.7 lifecycle_1.0.4 httr_1.4.7
[103] hardhat_1.3.0 mime_0.12 bit64_4.0.5
[106] MASS_7.3-60