library(ggplot2)
library(ComplexUpset)

Prepare the datasets

movies = as.data.frame(ggplot2movies::movies)
head(movies, 3)
A data.frame: 3 × 24
titleyearlengthbudgetratingvotesr1r2r3r4r9r10mpaaActionAnimationComedyDramaDocumentaryRomanceShort
<chr><int><int><int><dbl><int><dbl><dbl><dbl><dbl><dbl><dbl><chr><int><int><int><int><int><int><int>
1$ 1971121NA6.43484.5 4.54.5 4.5 4.5 4.50011000
2$1000 a Touchdown 1939 71NA6.0 200.014.54.524.5 4.514.50010000
3$21 a Day Once a Month1941 7NA8.2 50.0 0.00.0 0.024.524.50100001
genres = colnames(movies)[18:24]
genres
  1. ‘Action’
  2. ‘Animation’
  3. ‘Comedy’
  4. ‘Drama’
  5. ‘Documentary’
  6. ‘Romance’
  7. ‘Short’

Convert the genre indicator columns to use boolean values:

movies[genres] = movies[genres] == 1
t(head(movies[genres], 3))
A matrix: 7 × 3 of type lgl
123
ActionFALSEFALSEFALSE
AnimationFALSEFALSE TRUE
Comedy TRUE TRUEFALSE
Drama TRUEFALSEFALSE
DocumentaryFALSEFALSEFALSE
RomanceFALSEFALSEFALSE
ShortFALSEFALSE TRUE

To keep the examples fast to compile we will operate on a subset of the movies with complete data:

movies[movies$mpaa == '', 'mpaa'] = NA
movies = na.omit(movies)

Utility for changing output parameters in Jupyter notebooks (IRKernel kernel), not relevant if using RStudio or scripting R from terminal:

set_size = function(w, h, factor=1.5) {
    s = 1 * factor
    options(
        repr.plot.width=w * s,
        repr.plot.height=h * s,
        repr.plot.res=100 / factor,
        jupyter.plot_mimetypes='image/png',
        jupyter.plot_scale=1
    )
}

0. Basic usage

There are two required arguments:

Additional arguments can be provided, such as name (specifies xlab() for intersection matrix) or width_ratio (specifies how much space should be occupied by the set size panel). Other such arguments are discussed at length later in this document.

set_size(8, 3)
upset(movies, genres, name='genre', width_ratio=0.1)

0.1 Selecting intersections

We will focus on the intersections with at least ten members (min_size=10) and on a few variables which are significantly different between the intersections (see 2. Running statistical tests).

When using min_size, the empty groups will be skipped by default (e.g. Short movies would have no overlap with size of 10). To keep all groups pass keep_empty_groups=TRUE:

set_size(8, 3)
(
    upset(movies, genres, name='genre', width_ratio=0.1, min_size=10, wrap=TRUE, set_sizes=FALSE)
    + ggtitle('Without empty groups (Short dropped)')
    +    # adding plots is possible thanks to patchwork
    upset(movies, genres, name='genre', width_ratio=0.1, min_size=10, keep_empty_groups=TRUE, wrap=TRUE, set_sizes=FALSE)
    + ggtitle('With empty groups')
)

When empty columns are detected a warning will be issued. The silence it, pass warn_when_dropping_groups=FALSE. Complimentary max_size can be used in tandem.

You can also select intersections by degree (min_degree and max_degree):

set_size(8, 3)
upset(
    movies, genres, width_ratio=0.1,
    min_degree=3,
)

Or request a constant number of intersections with n_intersections:

set_size(8, 3)
upset(
    movies, genres, width_ratio=0.1,
    n_intersections=15
)

0.2 Region selection modes

There are three modes defining the regions of interest on corresponding Venn diagram:

Example: given three sets \(A\), \(B\) and \(C\) with number of elements defined by the Venn diagram below

abc_data = create_upset_abc_example()

abc_venn = (
    ggplot(arrange_venn(abc_data))
    + coord_fixed()
    + theme_void()
    + scale_color_venn_mix(abc_data)
)

(
    abc_venn
    + geom_venn_region(data=abc_data, alpha=0.05)
    + geom_point(aes(x=x, y=y, color=region), size=1)
    + geom_venn_circle(abc_data)
    + geom_venn_label_set(abc_data, aes(label=region))
    + geom_venn_label_region(
        abc_data, aes(label=size),
        outwards_adjust=1.75,
        position=position_nudge(y=0.2)
    )
    + scale_fill_venn_mix(abc_data, guide=FALSE)
)

For the above sets \(A\) and \(B\) the region selection modes correspond to region of Venn diagram defined as follows:

and have the total number of elements as in the table below:

members \ mode exclusive int. inclusive int. exclusive union inclusive union
(A, B) 10 11 110 123
(A, C) == (B, C) 6 7 256 273
(A) == (B) 50 67 50 67
© 200 213 200 213
(A, B, C) 1 1 323 323
() 2 2 2 2
set_size(6, 6.5)
simple_venn = (
    abc_venn
    + geom_venn_region(data=abc_data, alpha=0.3)
    + geom_point(aes(x=x, y=y), size=0.75, alpha=0.3)
    + geom_venn_circle(abc_data)
    + geom_venn_label_set(abc_data, aes(label=region), outwards_adjust=2.55)
)
highlight = function(regions) scale_fill_venn_mix(
    abc_data, guide=FALSE, highlight=regions, inactive_color='NA'
)

(
    (
        simple_venn + highlight(c('A-B')) + labs(title='Exclusive intersection of A and B')
        | simple_venn + highlight(c('A-B', 'A-B-C')) + labs(title='Inclusive intersection of A and B')
    ) /
    (
        simple_venn + highlight(c('A-B', 'A', 'B')) + labs(title='Exclusive union of A and B')
        | simple_venn + highlight(c('A-B', 'A-B-C', 'A', 'B', 'A-C', 'B-C')) + labs(title='Inclusive union of A and B')
    )
)

When customizing the intersection_size() it is important to adjust the mode accordingly, as it defaults to exclusive_intersection and cannot be automatically deduced when user customizations are being applied:

set_size(8, 4.5)
abc_upset = function(mode) upset(
    abc_data, c('A', 'B', 'C'), mode=mode, set_sizes=FALSE,
    encode_sets=FALSE,
    queries=list(upset_query(intersect=c('A', 'B'), color='orange')),
    base_annotations=list(
        'Size'=(
            intersection_size(
                mode=mode,
                mapping=aes(fill=exclusive_intersection),
                size=0,
                text=list(check_overlap=TRUE)
            ) + scale_fill_venn_mix(
                data=abc_data,
                guide=FALSE,
                colors=c('A'='red', 'B'='blue', 'C'='green3')
            )
        )
    )
)

(
    (abc_upset('exclusive_intersection') | abc_upset('inclusive_intersection'))
    /
    (abc_upset('exclusive_union') | abc_upset('inclusive_union'))
)

0.3 Displaying all intersections

To display all possible intersections (rather than only the observed ones) use intersections='all'.

Note 1: it is usually desired to filter all the possible intersections down with max_degree and/or min_degree to avoid generating all combinations as those can easily use up all available RAM memory when dealing with multiple sets (e.g. all human genes) due to sheer number of possible combinations

Note 2: using intersections='all' is only reasonable for mode different from the default exclusive intersection.

set_size(8, 3)
upset(
    movies, genres,
    width_ratio=0.1,
    min_size=10,
    mode='inclusive_union',
    base_annotations=list('Size'=(intersection_size(counts=FALSE, mode='inclusive_union'))),
    intersections='all',
    max_degree=3
)

1. Adding components

We can add multiple annotation components (also called panels) using one of the three methods demonstrated below:

set_size(8, 8)

set.seed(0)   # keep the same jitter for identical plots

upset(
    movies,
    genres,
    annotations = list(
        # 1st method - passing list:
        'Length'=list(
            aes=aes(x=intersection, y=length),
            # provide a list if you wish to add several geoms
            geom=geom_boxplot(na.rm=TRUE)
        ),
        # 2nd method - using ggplot
        'Rating'=(
            # note that aes(x=intersection) is supplied by default and can be skipped
            ggplot(mapping=aes(y=rating))
            # checkout ggbeeswarm::geom_quasirandom for better results!
            + geom_jitter(aes(color=log10(votes)), na.rm=TRUE)
            + geom_violin(alpha=0.5, na.rm=TRUE)
        ),
        # 3rd method - using `upset_annotate` shorthand
        'Budget'=upset_annotate('budget', geom_boxplot(na.rm=TRUE))
    ),
    min_size=10,
    width_ratio=0.1
)

You can also use barplots to demonstrate differences in proportions of categorical variables:

set_size(8, 5)

upset(
    movies,
    genres,
    annotations = list(
        'MPAA Rating'=(
            ggplot(mapping=aes(fill=mpaa))
            + geom_bar(stat='count', position='fill')
            + scale_y_continuous(labels=scales::percent_format())
            + scale_fill_manual(values=c(
                'R'='#E41A1C', 'PG'='#377EB8',
                'PG-13'='#4DAF4A', 'NC-17'='#FF7F00'
            ))
            + ylab('MPAA Rating')
        )
    ),
    width_ratio=0.1
)

1.1. Changing modes in annotations

Use upset_mode to change the mode of the annotation:

set_size(8, 8)
set.seed(0)
upset(
    movies,
    genres,
    mode='inclusive_intersection',
    annotations = list(
        # if not specified, the mode will follow the mode set in `upset()` call (here: `inclusive_intersection`)
        'Length (inclusive intersection)'=(
            ggplot(mapping=aes(y=length))
            + geom_jitter(alpha=0.2, na.rm=TRUE)
        ),
        'Length (exclusive intersection)'=(
            ggplot(mapping=aes(y=length))
            + geom_jitter(alpha=0.2, na.rm=TRUE)
            + upset_mode('exclusive_intersection')
        ),
        'Length (inclusive union)'=(
            ggplot(mapping=aes(y=length))
            + geom_jitter(alpha=0.2, na.rm=TRUE)
            + upset_mode('inclusive_union')
        )
    ),
    min_size=10,
    width_ratio=0.1
)

2. Running statistical tests

upset_test(movies, genres)
[1] "year, length, budget, rating, votes, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, mpaa differ significantly between intersections"
A data.frame: 17 × 5
variablep.valuestatistictestfdr
<chr><dbl><dbl><chr><dbl>
lengthlength6.511525e-71422.88444Kruskal-Wallis rank sum test1.106959e-69
ratingrating1.209027e-46301.72764Kruskal-Wallis rank sum test1.027673e-45
budgetbudget3.899860e-44288.97476Kruskal-Wallis rank sum test2.209921e-43
r8r8 9.900004e-39261.28815Kruskal-Wallis rank sum test4.207502e-38
mpaampaa 3.732200e-35242.77939Kruskal-Wallis rank sum test1.268948e-34
r9r9 1.433256e-30218.78160Kruskal-Wallis rank sum test4.060891e-30
r1r1 2.211600e-23180.32740Kruskal-Wallis rank sum test5.371029e-23
r4r4 1.008119e-18154.62772Kruskal-Wallis rank sum test2.142254e-18
r3r3 2.568227e-17146.70217Kruskal-Wallis rank sum test4.851095e-17
r5r5 9.823827e-16137.66310Kruskal-Wallis rank sum test1.670051e-15
r7r7 9.201549e-14126.19243Kruskal-Wallis rank sum test1.422058e-13
r2r2 2.159955e-13124.00604Kruskal-Wallis rank sum test3.059936e-13
r10r10 1.283470e-11113.38113Kruskal-Wallis rank sum test1.678384e-11
votesvotes 2.209085e-10105.79588Kruskal-Wallis rank sum test2.682460e-10
r6r6 3.779129e-05 70.80971Kruskal-Wallis rank sum test4.283013e-05
yearyear 2.745818e-02 46.55972Kruskal-Wallis rank sum test2.917431e-02
titletitle 2.600003e-01 34.53375Kruskal-Wallis rank sum test2.600003e-01

Kruskal-Wallis rank sum test is not always the best choice.

You can either change the test for:

The tests are called with (formula=variable ~ intersection, data) signature, such as accepted by kruskal.test. The result is expected to be a list with following members:

It is easy to adapt tests which do not obey this signature/output convention; for example the Chi-squared test and anova can be wrapped with two-line functions as follows:

chisq_from_formula = function(formula, data) {
    chisq.test(
        ftable(formula, data)
    )
}

anova_single = function(formula, data) {
    result = summary(aov(formula, data))
    list(
        p.value=result[[1]][['Pr(>F)']][[1]],
        method='Analysis of variance Pr(>F)',
        statistic=result[[1]][['F value']][[1]]
    )
}

custom_tests = list(
    mpaa=chisq_from_formula,
    budget=anova_single
)
head(upset_test(movies, genres, tests=custom_tests))
Warning message in chisq.test(ftable(formula, data)):
“Chi-squared approximation may be incorrect”


[1] "year, length, budget, rating, votes, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, mpaa differ significantly between intersections"
A data.frame: 6 × 5
variablep.valuestatistictestfdr
<chr><dbl><dbl><chr><dbl>
lengthlength6.511525e-71422.88444Kruskal-Wallis rank sum test1.106959e-69
budgetbudget1.348209e-60 13.66395Analysis of variance Pr(>F) 1.145977e-59
ratingrating1.209027e-46301.72764Kruskal-Wallis rank sum test6.851151e-46
mpaampaa 9.799097e-42406.33814Pearson’s Chi-squared test 4.164616e-41
r8r8 9.900004e-39261.28815Kruskal-Wallis rank sum test3.366002e-38
r9r9 1.433256e-30218.78160Kruskal-Wallis rank sum test4.060891e-30

Many tests will require at least two observations in each group. You can skip intersections with less than two members with min_size=2.

bartlett_results = suppressWarnings(upset_test(movies, genres, test=bartlett.test, min_size=2))
tail(bartlett_results)
[1] "NA, year, length, budget, rating, votes, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, NA differ significantly between intersections"
A data.frame: 6 × 5
variablep.valuestatistictestfdr
<chr><dbl><dbl><chr><dbl>
yearyear 1.041955e-67386.53699Bartlett test of homogeneity of variances1.302444e-67
lengthlength3.982729e-67383.70148Bartlett test of homogeneity of variances4.595457e-67
budgetbudget7.637563e-50298.89911Bartlett test of homogeneity of variances8.183103e-50
ratingrating3.980194e-06 66.63277Bartlett test of homogeneity of variances3.980194e-06
titletitle NA NABartlett test of homogeneity of variances NA
mpaampaa NA NABartlett test of homogeneity of variances NA

2.1 Ignore specific variables

You may want to exclude variables which are:

In the movies example, the title variable is not a reasonable thing to compare. We can ignore it using:

# note: title no longer present
rownames(upset_test(movies, genres, ignore=c('title')))
[1] "year, length, budget, rating, votes, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, mpaa differ significantly between intersections"
  1. ‘length’
  2. ‘rating’
  3. ‘budget’
  4. ‘r8’
  5. ‘mpaa’
  6. ‘r9’
  7. ‘r1’
  8. ‘r4’
  9. ‘r3’
  10. ‘r5’
  11. ‘r7’
  12. ‘r2’
  13. ‘r10’
  14. ‘votes’
  15. ‘r6’
  16. ‘year’

3. Adjusting “Intersection size”

3.1 Counts

The counts over the bars can be disabled:

set_size(8, 3)

upset(
    movies,
    genres,
    base_annotations=list(
        'Intersection size'=intersection_size(counts=FALSE)
    ),
    min_size=10,
    width_ratio=0.1
)

The colors can be changed, and additional annotations added:

set_size(8, 3)

upset(
    movies,
    genres,
    base_annotations=list(
        'Intersection size'=intersection_size(
            text_colors=c(
                on_background='brown', on_bar='yellow'
            )
        )
        + annotate(
            geom='text', x=Inf, y=Inf,
            label=paste('Total:', nrow(movies)),
            vjust=1, hjust=1
        )
        + ylab('Intersection size')
    ),
    min_size=10,
    width_ratio=0.1
)

Any parameter supported by geom_text can be passed in text list:

set_size(8, 3)

upset(
    movies,
    genres,
    base_annotations=list(
        'Intersection size'=intersection_size(
            text=list(
                vjust=-0.1,
                hjust=-0.1,
                angle=45
            )
        )
    ),
    min_size=10,
    width_ratio=0.1
)

3.2 Fill the bars

set_size(8, 3)

upset(
    movies,
    genres,
    base_annotations=list(
        'Intersection size'=intersection_size(
            counts=FALSE,
            mapping=aes(fill=mpaa)
        )
    ),
    width_ratio=0.1
)