A quick tour of mclust

Luca Scrucca

21 Nov 2017

Introduction

mclust is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulating from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation, and the Bayesian Information Criterion (BIC) into comprehensive strategies for clustering, density estimation, and discriminant analysis. Additional functions are available for displaying and visualizing fitted models, along with the corresponding clustering, classification, and density estimation results.

This document gives a quick tour of the functionality available in mclust (version 5.4). It was written in R Markdown, using the knitr package for production. See help(package = "mclust") for further details, and citation("mclust") for the references to cite when using the package.

library(mclust)
##     __  ___________    __  _____________
##    /  |/  / ____/ /   / / / / ___/_  __/
##   / /|_/ / /   / /   / / / /\__ \ / /   
##  / /  / / /___/ /___/ /_/ /___/ // /    
## /_/  /_/\____/_____/\____//____//_/    version 5.4
## Type 'citation("mclust")' for citing this R package in publications.
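
Besides clustering and classification, the package also provides density estimation and simulation from fitted models, as mentioned above. A minimal sketch, using the built-in faithful data (not part of this tour) purely for illustration:

# univariate density estimate for the waiting times in the 'faithful' data
dens <- densityMclust(faithful$waiting)
summary(dens)
plot(dens, what = "density", data = faithful$waiting)

# simulate new observations from the fitted mixture
sim_waiting <- sim(modelName = dens$modelName, parameters = dens$parameters, n = 100)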

Clustering

data(diabetes)
class <- diabetes$class
table(class)
## class
## Chemical   Normal    Overt 
##       36       76       33
X <- diabetes[,-1]
head(X)
##   glucose insulin sspg
## 1      80     356  124
## 2      97     289  117
## 3     105     319  143
## 4      90     356  199
## 5      90     323  240
## 6      86     381  157
# pairwise scatterplots with points marked by the known classification
clPairs(X, class)


# model selection via BIC over the default covariance structures and 1 to 9 components
BIC <- mclustBIC(X)
plot(BIC)

summary(BIC)
## Best BIC values:
##              VVV,3       VVV,4       EVE,6
## BIC      -4751.316 -4784.32213 -4785.24591
## BIC diff     0.000   -33.00573   -33.92951
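
By default mclustBIC() considers all the available covariance parameterisations and from 1 to 9 mixture components. The search can be narrowed through the G and modelNames arguments; for instance:

# restrict the search to at most 5 components and two covariance structures
BIC2 <- mclustBIC(X, G = 1:5, modelNames = c("EEE", "VVV"))
summary(BIC2)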

# fit the model with the best (highest) BIC, reusing the precomputed BIC table
mod1 <- Mclust(X, x = BIC)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -2303.496 145 29 -4751.316 -4770.169
## 
## Clustering table:
##  1  2  3 
## 81 36 28 
## 
## Mixing probabilities:
##         1         2         3 
## 0.5368974 0.2650129 0.1980897 
## 
## Means:
##              [,1]     [,2]       [,3]
## glucose  90.96239 104.5335  229.42136
## insulin 357.79083 494.8259 1098.25990
## sspg    163.74858 309.5583   81.60001
## 
## Variances:
## [,,1]
##          glucose    insulin       sspg
## glucose 57.18044   75.83206   14.73199
## insulin 75.83206 2101.76553  322.82294
## sspg    14.73199  322.82294 2416.99074
## [,,2]
##           glucose   insulin       sspg
## glucose  185.0290  1282.340  -509.7313
## insulin 1282.3398 14039.283 -2559.0251
## sspg    -509.7313 -2559.025 23835.7278
## [,,3]
##           glucose   insulin       sspg
## glucose  5529.250  20389.09  -2486.208
## insulin 20389.088  83132.48 -10393.004
## sspg    -2486.208 -10393.00   2217.533

plot(mod1, what = "classification")

table(class, mod1$classification)
##           
## class       1  2  3
##   Chemical  9 26  1
##   Normal   72  4  0
##   Overt     0  6 27
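
The agreement between the known classes and the clustering found by the model can also be summarised with the adjusted Rand index available in mclust:

# adjusted Rand index between the true classification and the model-based clustering
adjustedRandIndex(class, mod1$classification)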

par(mfrow = c(2,2))
plot(mod1, what = "uncertainty", dimens = c(2,1), main = "")
plot(mod1, what = "uncertainty", dimens = c(3,1), main = "")
plot(mod1, what = "uncertainty", dimens = c(2,3), main = "")
par(mfrow = c(1,1))


# model selection based on the Integrated Complete-data Likelihood (ICL) criterion
ICL <- mclustICL(X)
summary(ICL)
## Best ICL values:
##              VVV,3       EVE,6       EVE,7
## ICL      -4770.169 -4797.38232 -4797.50566
## ICL diff     0.000   -27.21342   -27.33677
plot(ICL)


# sequential bootstrap likelihood ratio tests for the number of components of the VVV model
LRT <- mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model        = VVV 
## Replications = 999 
##               LRTS bootstrap p-value
## 1 vs 2   361.16739             0.001
## 2 vs 3   123.49685             0.001
## 3 vs 4    16.76161             0.482
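
Bootstrap resampling of the selected model can also be used to assess the variability of the parameter estimates. A minimal sketch, assuming the nonparametric bootstrap option of MclustBootstrap():

# bootstrap standard errors for the parameters of the model chosen above
boot1 <- MclustBootstrap(mod1, nboot = 999, type = "bs")
summary(boot1, what = "se")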

Classification

EDDA

data(iris)
class <- iris$Species
table(class)
## class
##     setosa versicolor  virginica 
##         50         50         50
X <- iris[,1:4]
head(X)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# EDDA: one Gaussian component per class, with a common covariance structure selected by BIC
mod2 <- MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## EDDA model summary:
## 
##  log.likelihood   n df       BIC
##       -187.7097 150 36 -555.8024
##             
## Classes       n Model G
##   setosa     50   VEV 1
##   versicolor 50   VEV 1
##   virginica  50   VEV 1
## 
## Training classification summary:
## 
##             Predicted
## Class        setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          0        50
## 
## Training error = 0.02
plot(mod2, what = "scatterplot")

plot(mod2, what = "classification")
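
Since the training error is computed on the same data used for fitting, it is usually optimistic. A cross-validated error can be obtained with cvMclustDA(); a minimal sketch, assuming the cross-validated error is returned in the error component:

# 10-fold cross-validated classification error for the EDDA model
cv <- cvMclustDA(mod2, nfold = 10)
cv$error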

MclustDA

data(banknote)
class <- banknote$Status
table(class)
## class
## counterfeit     genuine 
##         100         100
X <- banknote[,-1]
head(X)
##   Length  Left Right Bottom  Top Diagonal
## 1  214.8 131.0 131.1    9.0  9.7    141.0
## 2  214.6 129.7 129.7    8.1  9.5    141.7
## 3  214.8 129.7 129.7    8.7  9.6    142.2
## 4  214.8 129.7 129.6    7.5 10.4    142.0
## 5  215.0 129.6 129.7   10.4  7.7    141.8
## 6  215.7 130.8 130.5    9.0 10.1    141.4
# MclustDA: each class modelled by a Gaussian mixture, with model and number of components chosen per class
mod3 <- MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## MclustDA model summary:
## 
##  log.likelihood   n df       BIC
##       -646.0801 200 66 -1641.849
##              
## Classes         n Model G
##   counterfeit 100   EVE 2
##   genuine     100   XXX 1
## 
## Training classification summary:
## 
##              Predicted
## Class         counterfeit genuine
##   counterfeit         100       0
##   genuine               0     100
## 
## Training error = 0
plot(mod3, what = "scatterplot")
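
Out-of-sample performance can be assessed by fitting the classifier on a training subset and predicting the held-out banknotes. A minimal sketch (the split below is illustrative):

# hold out one third of the observations and predict them with a model fitted on the rest
set.seed(20171121)
train <- sample(seq_len(nrow(X)), size = round(2/3 * nrow(X)))
mod3train <- MclustDA(X[train, ], class[train])
pred <- predict(mod3train, newdata = X[-train, ])
table(class[-train], pred$classification)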