K-Centroids Cluster Analysis - a unified framework for partitional clustering with selectable distance / centroid
families: standard k-means, k-medians, spherical k-means ("angle"), Jaccard, and extended Jaccard.
Calls flexclust::kcca() from package flexclust.
The k parameter is set to 2 by default since flexclust::kcca() has no default value for the number of clusters.
Predictions dispatch to flexclust's S4 predict method via methods::getMethod("predict", "kccasimple")
rather than calling predict() directly, since both flexclust and kernlab define an S4 class
named "kcca" and the resulting class-cache collision can break S4 dispatch when both packages are loaded.
Dictionary
This mlr3::Learner can be instantiated via the dictionary mlr3::mlr_learners or with the associated sugar function mlr3::lrn():
mlr_learners$get("clust.kcca")
lrn("clust.kcca")
Meta Information
Task type: “clust”
Predict Types: “partition”
Feature Types: “logical”, “integer”, “numeric”
Required Packages: mlr3, mlr3cluster, flexclust
Parameters
| Id | Type | Default | Levels | Range |
|---|---|---|---|---|
| k | integer | - | | \([1, \infty)\) |
| family | character | kmeans | kmeans, kmedians, angle, jaccard, ejaccard | - |
| weights | untyped | - | | - |
| group | untyped | - | | - |
| simple | logical | FALSE | TRUE, FALSE | - |
| save.data | logical | FALSE | TRUE, FALSE | - |
| iter.max | integer | 200 | | \([1, \infty)\) |
| tolerance | numeric | 1e-06 | | \([0, \infty)\) |
| verbose | integer | 0 | | \([0, \infty)\) |
| classify | character | auto | auto, weighted, hard | - |
| initcent | untyped | - | | - |
| gamma | numeric | 1 | | \([0, \infty)\) |
| ntry | integer | 5 | | \([1, \infty)\) |
| min.size | integer | 2 | | \([1, \infty)\) |
References
Leisch, Friedrich (2006). “A Toolbox for K-Centroids Cluster Analysis.” Computational Statistics & Data Analysis, 51(2), 526–544. doi:10.1016/j.csda.2005.10.006.
See also
Chapter in the mlr3book: https://mlr3book.mlr-org.com/chapters/chapter2/data_and_basic_modeling.html#sec-learners
Package mlr3extralearners for more learners.
as.data.table(mlr_learners) for a table of available Learners in the running session (depending on the loaded packages).
mlr3pipelines to combine learners with pre- and postprocessing steps.
Extension packages for additional task types:
mlr3proba for probabilistic supervised regression and survival analysis.
mlr3cluster for unsupervised clustering.
mlr3tuning for tuning of hyperparameters, mlr3tuningspaces for established default tuning spaces.
Other Learner:
mlr_learners_clust.MBatchKMeans,
mlr_learners_clust.SimpleKMeans,
mlr_learners_clust.agnes,
mlr_learners_clust.ap,
mlr_learners_clust.bico,
mlr_learners_clust.birch,
mlr_learners_clust.clara,
mlr_learners_clust.cmeans,
mlr_learners_clust.cobweb,
mlr_learners_clust.dbscan,
mlr_learners_clust.dbscan_fpc,
mlr_learners_clust.diana,
mlr_learners_clust.em,
mlr_learners_clust.fanny,
mlr_learners_clust.featureless,
mlr_learners_clust.ff,
mlr_learners_clust.flexmix,
mlr_learners_clust.genie,
mlr_learners_clust.hclust,
mlr_learners_clust.hdbscan,
mlr_learners_clust.kkmeans,
mlr_learners_clust.kmeans,
mlr_learners_clust.kproto,
mlr_learners_clust.mclust,
mlr_learners_clust.meanshift,
mlr_learners_clust.movMF,
mlr_learners_clust.optics,
mlr_learners_clust.pam,
mlr_learners_clust.protoclust,
mlr_learners_clust.skmeans,
mlr_learners_clust.som,
mlr_learners_clust.specc,
mlr_learners_clust.stdbscan,
mlr_learners_clust.tclust,
mlr_learners_clust.xmeans
Super classes
mlr3::Learner -> LearnerClust -> LearnerClustKCCA
Examples
# Define the Learner and set parameter values
learner = lrn("clust.kcca")
print(learner)
#>
#> ── <LearnerClustKCCA> (clust.kcca): K-Centroids Cluster Analysis ───────────────
#> • Model: -
#> • Parameters: k=2
#> • Packages: mlr3, mlr3cluster, and flexclust
#> • Predict Types: [partition]
#> • Feature Types: logical, integer, and numeric
#> • Encapsulation: none (fallback: -)
#> • Properties: complete, exclusive, and partitional
#> • Other settings: use_weights = 'error', predict_raw = 'FALSE'
# Define a Task
task = tsk("usarrests")
# Train the learner on the task
learner$train(task)
#> Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
#> Also defined by ‘flexclust’
#> Found more than one class "kcca" in cache; using the first, from namespace 'kernlab'
#> Also defined by ‘flexclust’
# Print the model
print(learner$model)
#> kcca object of family ‘kmeans’
#>
#> call:
#> flexclust::kcca(x = as.matrix(task$data()), k = 2L, family = new("kccaFamily",
#> name = "kmeans", dist = function (x, centers)
#> {
#> if (ncol(x) != ncol(centers))
#> stop(sQuote("x"), " and ", sQuote("centers"), " must have the same number of columns")
#> z <- matrix(0, nrow = nrow(x), ncol = nrow(centers))
#> for (k in 1:nrow(centers)) {
#> z[, k] <- sqrt(colSums((t(x) - centers[k, ])^2))
#> }
#> z
#> }, cent = function (x)
#> colMeans(x), allcent = function (x, cluster, k = max(cluster,
#> na.rm = TRUE))
#> {
#> centers <- matrix(NA, nrow = k, ncol = ncol(x))
#> for (n in 1:k) {
#> if (sum(cluster == n, na.rm = TRUE) > 0) {
#> centers[n, ] <- z@cent(x[cluster == n, , drop = FALSE])
#> }
#> }
#> centers
#> }, wcent = function (x, weights)
#> colMeans(x * normWeights(weights)), weighted = TRUE, cluster = function (x,
#> centers, n = 1, distmat = NULL)
#> {
#> if (is.null(distmat))
#> distmat <- z@dist(x, centers)
#> if (n == 1) {
#> return(max.col(-distmat))
#> }
#> else {
#> r <- t(matrix(apply(distmat, 1, rank, ties.method = "random"),
#> nrow = ncol(distmat)))
#> z <- list()
#> for (k in 1:n) z[[k]] <- apply(r, 1, function(x) which(x ==
#> k))
#> }
#> return(z)
#> }, preproc = function (x)
#> x, groupFun = function (cluster, group, distmat)
#> {
#> G <- levels(group)
#> x <- matrix(0, ncol = ncol(distmat), nrow = length(G))
#> for (n in 1:length(G)) {
#> x[n, ] <- colSums(distmat[group == G[n], , drop = FALSE])
#> }
#> m <- max.col(-x)
#> names(m) <- G
#> z <- m[group]
#> names(z) <- NULL
#> if (is.list(cluster)) {
#> x[cbind(1:nrow(x), m)] <- Inf
#> m <- max.col(-x)
#> names(m) <- G
#> z1 <- m[group]
#> names(z1) <- NULL
#> z <- list(z, z1)
#> }
#> z
#> }, genDist = function ()
#> NULL))
#>
#> cluster sizes:
#>
#> 1 2
#> 21 29
#>
# Make predictions for the task
prediction = learner$predict(task)
# Score the predictions
prediction$score(task = task)
#> clust.dunn
#> 0.1033191