Sparse Marices with funrar

Matthias Grenié

2016-11-22

Sparse Matrices Usefulness

When a matrix contains a lot of zero, there is no need to store all its values. But only the non-zero values, for example the following matrix:

\[\begin{equation} M = \bordermatrix{ ~ & \text{sp. A} & \text{sp. B} & {sp. C} \cr \text{site 1} & 1 & 0 & 0 \cr \text{site 2} & 0 & 1 & 1 \cr \text{site 3} & 0 & 1 & 0 \cr} \end{equation}\]

can be stored using only a matrix with non zero-values \(S\), with row index \(i\) and column index \(j\):

\[\begin{equation} S = \begin{matrix} i & j & \text{value} \\ \text{site 1} & \text{sp. A} & 1 \\ \text{site 2} & \text{sp. B} & 1 \\ \text{site 3} & \text{sp. B} & 1 \\ \text{site 2} & \text{sp. C} & 1 \end{matrix} \end{equation}\]

If your site-species matrix is big, it may contain a lot zero because it is very unlikely that all species are present at all sites when looking at thousands of species and hundreds of sites. Thus, the sparse matrix notation would save a good amount of lines (look how \(S\) compared to \(M\)).

# Generate a matrix with 1000 species and 200 sites
my_mat = matrix(sample(c(0, 0, 0, 0, 0, 0, 1), replace = TRUE, size = 200000),
                ncol = 1000, nrow = 200)
colnames(my_mat) = paste0("sp", 1:ncol(my_mat))
rownames(my_mat) = paste0("site", 1:nrow(my_mat))

my_mat[1:5, 1:5]
##       sp1 sp2 sp3 sp4 sp5
## site1   0   0   0   0   0
## site2   0   0   0   0   0
## site3   0   0   0   0   0
## site4   0   0   0   0   0
## site5   0   0   0   0   0

funrar lets you use sparse matrices directly using a function implemented in the Matrix package. When your matrix is filled with 0s it can be quicker to use sparse matrices. To know if you should use sparse matrices you can compute the filling of your matrix, i.e. the percentage of non-zero cells in your matrix:

filling = 1 - sum(my_mat == 0)/(ncol(my_mat)*nrow(my_mat))

filling
## [1] 0.143695

To convert from a normal matrix to a sparse matrix you can use as(my_mat, "sparseMatrix"):

library(Matrix)

sparse_mat = as(my_mat, "sparseMatrix")

is(my_mat, "sparseMatrix")
## [1] FALSE
is(sparse_mat, "sparseMatrix")
## [1] TRUE

it completely changes the structure as well as the memory it takes in the RAM:

# Regular Matrix
str(my_mat)
##  num [1:200, 1:1000] 0 0 0 0 0 0 0 1 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:200] "site1" "site2" "site3" "site4" ...
##   ..$ : chr [1:1000] "sp1" "sp2" "sp3" "sp4" ...
print(object.size(my_mat), units = "Kb")
## 1628.6 Kb
# Sparse Matrix from 'Matrix' package
str(sparse_mat)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:28739] 7 12 21 28 31 55 58 59 62 67 ...
##   ..@ p       : int [1:1001] 0 29 64 96 126 154 187 211 236 261 ...
##   ..@ Dim     : int [1:2] 200 1000
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:200] "site1" "site2" "site3" "site4" ...
##   .. ..$ : chr [1:1000] "sp1" "sp2" "sp3" "sp4" ...
##   ..@ x       : num [1:28739] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()
print(object.size(sparse_mat), units = "Kb")
## 407.8 Kb

sparse matrices reduce the amount of RAM necessary to store them. The more zeros there are in a given matrix the more RAM will be spared when turning it into a sparse matrix.

Benchmark

We can now compare the performances of the algorithms in rarity indices computation between a sparse matrix and a regular one, using the popular microbenchmark R package:

library(microbenchmark)
library(funrar)

# Get a table of traits
trait_df = data.frame(trait = runif(ncol(my_mat), 0, 1))
rownames(trait_df) = paste0("sp", 1:ncol(my_mat))

# Compute distance matrix
dist_mat = compute_dist_matrix(trait_df)

microbenchmark(regular = distinctiveness(my_mat, dist_mat),
               sparse = distinctiveness(sparse_mat, dist_mat))
## Unit: milliseconds
##     expr      min       lq     mean   median       uq      max neval cld
##  regular 323.5533 364.1125 479.2402 544.2697 554.6250 620.4377   100   b
##   sparse 261.3286 308.1827 398.4806 401.0769 493.4916 544.8086   100  a

Systematic benchmark

We generate matrices with different filling rate and compare the speed of regular matrix and sparse matrices computation.

generate_matrix = function(n_zero = 5, nrow = 200, ncol = 1000) {
  matrix(sample(c(rep(0, n_zero), 1), replace = TRUE, size = nrow*ncol),
                ncol = ncol, nrow = nrow)
}

mat_filling = function(my_mat) {
  sum(my_mat != 0)/(ncol(my_mat)*nrow(my_mat))
}

sparse_and_mat = function(n_zero) {
  my_mat = generate_matrix(n_zero)
  colnames(my_mat) = paste0("sp", 1:ncol(my_mat))
  rownames(my_mat) = paste0("site", 1:nrow(my_mat))
  
  sparse_mat = as(my_mat, "sparseMatrix")
  
  return(list(mat = my_mat, sparse = sparse_mat))
}

n_zero_vector = c(0, 1, 2, 49, 99)
names(n_zero_vector) = n_zero_vector

all_mats = lapply(n_zero_vector, sparse_and_mat)

mat_filling(all_mats$`0`$mat)
## [1] 1
mat_filling(all_mats$`99`$mat)
## [1] 0.0101

Now we can compare the speed of the algorithms:

mat_bench = microbenchmark(
  mat_0 = distinctiveness(all_mats$`0`$mat, dist_mat),
  sparse_0 = distinctiveness(all_mats$`0`$sparse, dist_mat),
  mat_1 = distinctiveness(all_mats$`1`$mat, dist_mat),
  sparse_1 = distinctiveness(all_mats$`1`$sparse, dist_mat),
  mat_2 = distinctiveness(all_mats$`2`$mat, dist_mat),
  sparse_2 = distinctiveness(all_mats$`2`$sparse, dist_mat),
  mat_49 = distinctiveness(all_mats$`49`$mat, dist_mat),
  sparse_49 = distinctiveness(all_mats$`49`$sparse, dist_mat),
  mat_99 = distinctiveness(all_mats$`99`$mat, dist_mat),
  sparse_99 = distinctiveness(all_mats$`99`$sparse, dist_mat),
  times = 5)

autoplot(mat_bench)