Comparisons with other packages

This vignette provides comparisons with other packages that provide similar functionality. This is a work in progress – for a more detailed / complete / coherent comparison with other packages which provide wide-to-tall data reshaping, see my paper.

We begin by defining a helper function which ensures that results are the same:

compare <- function(..., exclude="variable"){
  mc <- match.call()
  default.names <- paste(lapply(mc[-1], "[[", 1))
  L <- list(...)
  to.rep <- if(is.null(names(L)))rep(TRUE, length(L)) else names(L)==""
  names(L)[to.rep] <- default.names[to.rep]
  sorted.list <- list()
  all.name.vec <- names(L[[1]])
  name.vec <- all.name.vec[!all.name.vec %in% exclude]
  for(fun.name in names(L[-1])){
    result.dt <- data.table(L[[fun.name]])
    if(0 < nrow(result.dt)){
      if(!all(name.vec %in% names(result.dt))){
        print(result.dt)
        stop(fun.name, " result needs names: ", paste(name.vec, collapse=", "))
      }
      sorted <- result.dt[, name.vec, with=FALSE]
      setkeyv(sorted, name.vec)
      sorted.list[[fun.name]] <- sorted
    }
  }
  ref.dt <- sorted.list[[1]]
  others <- sorted.list[-1]
  for(fun.name in names(others)){
    other.dt <- others[[fun.name]]
    all.eq <- all.equal(other.dt, ref.dt)
    if(!isTRUE(all.eq)){
      cat(fun.name, "\n", all.eq, "\n\n", sep="")
    }
  }
  L
}

Note that the function above returns a list of data frames, and checks that each element in the list is equal to the first element.

Are sepals larger or smaller than petals in the iris data?

Sometimes you want to melt a “wide” data table which has several distinct pieces of information encoded in each column name. One example is the familiar iris data, which have flower part and measurement dimension encoded in each of four column names:

library(data.table)
data.table(iris)
#>      Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
#>   1:          5.1         3.5          1.4         0.2    setosa
#>   2:          4.9         3.0          1.4         0.2    setosa
#>   3:          4.7         3.2          1.3         0.2    setosa
#>   4:          4.6         3.1          1.5         0.2    setosa
#>   5:          5.0         3.6          1.4         0.2    setosa
#>  ---                                                            
#> 146:          6.7         3.0          5.2         2.3 virginica
#> 147:          6.3         2.5          5.0         1.9 virginica
#> 148:          6.5         3.0          5.2         2.0 virginica
#> 149:          6.2         3.4          5.4         2.3 virginica
#> 150:          5.9         3.0          5.1         1.8 virginica

The goal in this section will be to convert these data into a format with a column for each flower part (Sepal and Petal) so we can easily make a facetted scatterplot to visually examine whether or not sepals or larger than petals. The easiest way to perform this conversion is with packages which provide a function for melting into multiple output columns:

iris.parts <- compare(
  nc::capture_melt_multiple(
    iris,
    column=".*?",
    "[.]",
    dim=".*"),
  tidyr::pivot_longer(
    iris, 
    cols=1:4, 
    names_to=c(".value", "dim"),
    names_sep="[.]"),
  stats::reshape(
    iris,
    direction="long",
    timevar="dim",
    varying=1:4,
    sep="."),
  "data.table::melt"=melt(
    data.table(iris),
    measure.vars=patterns(
      Sepal="^Sepal",
      Petal="^Petal")
  )[data.table(
    variable=factor(1:2), dim=c("Length", "Width")
  ), on=.(variable)],
  if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
    iris,
    controlTable=data.frame(
      dim=c("Length", "Width"),
      Petal=c("Petal.Length", "Petal.Width"),
      Sepal=c("Sepal.Length", "Sepal.Width"),
      stringsAsFactors=FALSE),
    columnsToCopy="Species"))
#> Loading required namespace: cdata
iris.parts$nc
#>        Species    dim Petal Sepal
#>   1:    setosa Length   1.4   5.1
#>   2:    setosa Length   1.4   4.9
#>   3:    setosa Length   1.3   4.7
#>   4:    setosa Length   1.5   4.6
#>   5:    setosa Length   1.4   5.0
#>  ---                             
#> 296: virginica  Width   2.3   3.0
#> 297: virginica  Width   1.9   2.5
#> 298: virginica  Width   2.0   3.0
#> 299: virginica  Width   2.3   3.4
#> 300: virginica  Width   1.8   3.0

It is clear from the code above that each package is capable of the conversions. However the syntax and level of explicitness varies:

Any of the results can be visualized via:

library(ggplot2)
ggplot()+
  theme_bw()+
  theme(panel.spacing=grid::unit(0, "lines"))+
  facet_grid(dim ~ Species)+
  coord_equal()+
  geom_abline(slope=1, intercept=0, color="grey")+
  geom_point(aes(
    Petal, Sepal),
    data=iris.parts$nc)

plot of chunk unnamed-chunk-4

It is clear from the plot above that sepals are larger than petals, for every measured flower.

Comparing dimensions in iris data

What if we wanted to compare dimensions rather than parts?

iris.dims <- compare(
  nc::capture_melt_multiple(
    iris,
    part=".*?",
    "[.]",
    column=".*"),
  stats::reshape(
    structure(iris, names=sub("(.*?)[.](.*)", "\\2.\\1", names(iris))),
    direction="long",
    timevar="part",
    varying=1:4,
    sep="."))
iris.dims$nc
#>        Species  part Length Width
#>   1:    setosa Petal    1.4   0.2
#>   2:    setosa Petal    1.4   0.2
#>   3:    setosa Petal    1.3   0.2
#>   4:    setosa Petal    1.5   0.2
#>   5:    setosa Petal    1.4   0.2
#>  ---                             
#> 296: virginica Sepal    6.7   3.0
#> 297: virginica Sepal    6.3   2.5
#> 298: virginica Sepal    6.5   3.0
#> 299: virginica Sepal    6.2   3.4
#> 300: virginica Sepal    5.9   3.0

The code above shows that the syntax is mostly the same for this example. The biggest difference is for stats::reshape which assumes that each input column name is composed of (1) the output column name, (2) a delimiter, and (3) some additional information to be stored in the output column given by timevar. Therefore we need to pre-process column names using sub for it to work.

ggplot()+
  theme_bw()+
  theme(panel.spacing=grid::unit(0, "lines"))+
  facet_grid(part ~ Species)+
  coord_equal()+
  geom_abline(slope=1, intercept=0, color="grey")+
  geom_point(aes(
    Length, Width),
    data=iris.dims$nc)

plot of chunk unnamed-chunk-6

It is clear from the plot above that Length is larger than Width for every measured flower part.

Do columns need to be sorted?

Consider the following wide data set:

TC <- data.table::data.table(
  age.treatment=c(1, 5),
  sex.control=c("M", "M"),
  sex.treatment=c("F", "F"),
  age.control=c(10, 50))

It is clear from the column names how the data should be grouped when they are converted to tall format. However the columns do not appear in regular order (age is before sex for treatment, but age is after sex for control), which causes a problem for stats and data.table:

input.list <- list(
  "nc"=nc::capture_melt_multiple(
    TC,
    column=".*?",
    "[.]",
    group=".*"),
  "cdata"=if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
    TC,
    controlTable=data.frame(
      group=c("treatment", "control"),
      age=c("age.treatment", "age.control"),
      sex=c("sex.treatment", "sex.control"),
      stringsAsFactors=FALSE)),
  "data.table"=data.table::melt(TC, measure.vars=patterns(
    age="age",
    sex="sex")),
  "stats"=stats::reshape(
    TC,
    varying=1:4,
    direction="long"),
  "tidyr"=tidyr::pivot_longer(
    TC, 1:4,
    names_to=c(".value", "group"),
    names_sep="[.]"))
output.list <- list()
for(pkg in names(input.list)){
  df.or.null <- input.list[[pkg]]
  if(is.data.frame(df.or.null)){
    output.list[[pkg]] <- data.table::data.table(df.or.null)[order(age)]
  }
}
output.list
#> $nc
#>        group age sex
#> 1: treatment   1   F
#> 2: treatment   5   F
#> 3:   control  10   M
#> 4:   control  50   M
#> 
#> $cdata
#>        group age sex
#> 1: treatment   1   F
#> 2: treatment   5   F
#> 3:   control  10   M
#> 4:   control  50   M
#> 
#> $data.table
#>    variable age sex
#> 1:        1   1   M
#> 2:        1   5   M
#> 3:        2  10   F
#> 4:        2  50   F
#> 
#> $stats
#>         time age sex id
#> 1: treatment   1   M  1
#> 2: treatment   5   M  2
#> 3:   control  10   F  1
#> 4:   control  50   F  2
#> 
#> $tidyr
#>        group age sex
#> 1: treatment   1   F
#> 2: treatment   5   F
#> 3:   control  10   M
#> 4:   control  50   M
sapply(output.list, function(DT)identical(DT$sex, c("F", "F", "M", "M")))
#>         nc      cdata data.table      stats      tidyr 
#>       TRUE       TRUE      FALSE      FALSE       TRUE

In conclusion, when the input column names to melt do not appear in the same order across groups or output columns, then the correct tall data can be computed using one of nc::capture_melt_multiple, tidyr::pivot_longer, cdata::rowrecs_to_blocks.

Melting into a single output column, who data

Another data set where it is useful to do column name pattern matching followed by melting is the World Health Organization data:

data(who, package="tidyr")
names(who)
#>  [1] "country"      "iso2"         "iso3"         "year"         "new_sp_m014" 
#>  [6] "new_sp_m1524" "new_sp_m2534" "new_sp_m3544" "new_sp_m4554" "new_sp_m5564"
#> [11] "new_sp_m65"   "new_sp_f014"  "new_sp_f1524" "new_sp_f2534" "new_sp_f3544"
#> [16] "new_sp_f4554" "new_sp_f5564" "new_sp_f65"   "new_sn_m014"  "new_sn_m1524"
#> [21] "new_sn_m2534" "new_sn_m3544" "new_sn_m4554" "new_sn_m5564" "new_sn_m65"  
#> [26] "new_sn_f014"  "new_sn_f1524" "new_sn_f2534" "new_sn_f3544" "new_sn_f4554"
#> [31] "new_sn_f5564" "new_sn_f65"   "new_ep_m014"  "new_ep_m1524" "new_ep_m2534"
#> [36] "new_ep_m3544" "new_ep_m4554" "new_ep_m5564" "new_ep_m65"   "new_ep_f014" 
#> [41] "new_ep_f1524" "new_ep_f2534" "new_ep_f3544" "new_ep_f4554" "new_ep_f5564"
#> [46] "new_ep_f65"   "newrel_m014"  "newrel_m1524" "newrel_m2534" "newrel_m3544"
#> [51] "newrel_m4554" "newrel_m5564" "newrel_m65"   "newrel_f014"  "newrel_f1524"
#> [56] "newrel_f2534" "newrel_f3544" "newrel_f4554" "newrel_f5564" "newrel_f65"

Each column which starts with new has three distinct pieces of information encoded in its name: diagnosis type (e.g. sp or rel), gender (m or f), and age range (e.g. 5564 or 1524). We would like to use a regex to match these column names, then using the matching columns as measure.vars in a melt, then join the two results. The most convenient way to do that is via:

compare(
  nc::capture_melt_single(
    who,
    "new_?",
    diagnosis=".*",
    "_",
    gender=".",
    ages=".*"),
  tidyr::pivot_longer(
    who,
    new_sp_m014:newrel_f65,
    names_to=c("diagnosis", "gender", "ages"),
    names_pattern="new_?(.*)_(.)(.*)"))
#> $`nc::capture_melt_single`
#>                          country iso2 iso3 year diagnosis gender ages value
#>     1:               Afghanistan   AF  AFG 1997        sp      m  014     0
#>     2:               Afghanistan   AF  AFG 1998        sp      m  014    30
#>     3:               Afghanistan   AF  AFG 1999        sp      m  014     8
#>     4:               Afghanistan   AF  AFG 2000        sp      m  014    52
#>     5:               Afghanistan   AF  AFG 2001        sp      m  014   129
#>    ---                                                                     
#> 76042:                  Viet Nam   VN  VNM 2013       rel      f   65  3110
#> 76043: Wallis and Futuna Islands   WF  WLF 2013       rel      f   65     2
#> 76044:                     Yemen   YE  YEM 2013       rel      f   65   360
#> 76045:                    Zambia   ZM  ZMB 2013       rel      f   65   669
#> 76046:                  Zimbabwe   ZW  ZWE 2013       rel      f   65   725
#> 
#> $`tidyr::pivot_longer`
#> # A tibble: 405,440 x 8
#>    country     iso2  iso3   year diagnosis gender ages  value
#>    <chr>       <chr> <chr> <int> <chr>     <chr>  <chr> <int>
#>  1 Afghanistan AF    AFG    1980 sp        m      014      NA
#>  2 Afghanistan AF    AFG    1980 sp        m      1524     NA
#>  3 Afghanistan AF    AFG    1980 sp        m      2534     NA
#>  4 Afghanistan AF    AFG    1980 sp        m      3544     NA
#>  5 Afghanistan AF    AFG    1980 sp        m      4554     NA
#>  6 Afghanistan AF    AFG    1980 sp        m      5564     NA
#>  7 Afghanistan AF    AFG    1980 sp        m      65       NA
#>  8 Afghanistan AF    AFG    1980 sp        f      014      NA
#>  9 Afghanistan AF    AFG    1980 sp        f      1524     NA
#> 10 Afghanistan AF    AFG    1980 sp        f      2534     NA
#> # ... with 405,430 more rows

Note the result includes additional column value which contains the melted data. There is also a column for each capture group in the specified pattern. The following example shows how to rename the value column, remove missing values, and use numeric type conversion functions:

library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.6.2
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#> 
#>     between, first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
who.pattern <- "new_?(.*)_(.)((0|[0-9]{2})([0-9]{0,2}))"
who.typed <- compare(
  nc::capture_melt_single(
    who,
    "new_?",
    diagnosis=".*",
    "_",
    gender=".",
    ages=list(
      ymin.num="0|[0-9]{2}", as.numeric,
      ymax.num="[0-9]{0,2}", function(y)ifelse(y=="", Inf, as.numeric(y))),
    value.name="count",
    na.rm=TRUE),
  tidyr::pivot_longer(
    who,
    cols=grep(who.pattern, names(who)),
    names_ptypes=list(ymin.num=numeric()),
    names_to=c("diagnosis", "gender", "ages", "ymin.num", "ymax.chr"),
    names_pattern=who.pattern,
    values_drop_na=TRUE,
    values_to="count"
   ) %>% dplyr::mutate(
    ymax.num=ifelse(ymax.chr=="", Inf, as.numeric(ymax.chr))))
str(who.typed)  
#> List of 2
#>  $ nc::capture_melt_single:Classes 'data.table' and 'data.frame':    76046 obs. of  10 variables:
#>   ..$ country  : chr [1:76046] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#>   ..$ iso2     : chr [1:76046] "AF" "AF" "AF" "AF" ...
#>   ..$ iso3     : chr [1:76046] "AFG" "AFG" "AFG" "AFG" ...
#>   ..$ year     : int [1:76046] 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 ...
#>   ..$ diagnosis: chr [1:76046] "sp" "sp" "sp" "sp" ...
#>   ..$ gender   : chr [1:76046] "m" "m" "m" "m" ...
#>   ..$ ages     : chr [1:76046] "014" "014" "014" "014" ...
#>   ..$ ymin.num : num [1:76046] 0 0 0 0 0 0 0 0 0 0 ...
#>   ..$ ymax.num : num [1:76046] 14 14 14 14 14 14 14 14 14 14 ...
#>   ..$ count    : int [1:76046] 0 30 8 52 129 90 127 139 151 193 ...
#>   ..- attr(*, ".internal.selfref")=<externalptr> 
#>  $ `%>%`                  :Classes 'tbl_df', 'tbl' and 'data.frame': 76046 obs. of  11 variables:
#>   ..$ country  : chr [1:76046] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
#>   ..$ iso2     : chr [1:76046] "AF" "AF" "AF" "AF" ...
#>   ..$ iso3     : chr [1:76046] "AFG" "AFG" "AFG" "AFG" ...
#>   ..$ year     : int [1:76046] 1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
#>   ..$ diagnosis: chr [1:76046] "sp" "sp" "sp" "sp" ...
#>   ..$ gender   : chr [1:76046] "m" "m" "m" "m" ...
#>   ..$ ages     : chr [1:76046] "014" "1524" "2534" "3544" ...
#>   ..$ ymin.num : num [1:76046] 0 15 25 35 45 55 65 0 15 25 ...
#>   ..$ ymax.chr : chr [1:76046] "14" "24" "34" "44" ...
#>   ..$ count    : int [1:76046] 0 10 6 3 5 2 0 5 38 36 ...
#>   ..$ ymax.num : num [1:76046] 14 24 34 44 54 ...

The result above shows that nc::capture_melt_single (1) makes it easier to define complex patterns (2) supports type conversion without a post-processing step, and (3) reduces repetition in user code. There are several sources of repetition in tidyr code:

Other packages for doing this include:

gather.result <- tidyr::gather(
  who,
  "variable",
  "count",
  grep(who.pattern, names(who)),
  na.rm=TRUE)
tidyr::extract(
  gather.result,
  "variable",
  c("diagnosis", "gender", "ages", "ymin.int", "ymax.int"),
  who.pattern,
  convert=TRUE
) %>% dplyr::mutate(
  ymin.num=as.numeric(ymin.int),
  ymax.num=ifelse(is.na(ymax.int), Inf, as.numeric(ymax.int)))
#> # A tibble: 76,046 x 12
#>    country iso2  iso3   year diagnosis gender  ages ymin.int ymax.int count
#>    <chr>   <chr> <chr> <int> <chr>     <chr>  <int>    <int>    <int> <int>
#>  1 Afghan~ AF    AFG    1997 sp        m         14        0       14     0
#>  2 Afghan~ AF    AFG    1998 sp        m         14        0       14    30
#>  3 Afghan~ AF    AFG    1999 sp        m         14        0       14     8
#>  4 Afghan~ AF    AFG    2000 sp        m         14        0       14    52
#>  5 Afghan~ AF    AFG    2001 sp        m         14        0       14   129
#>  6 Afghan~ AF    AFG    2002 sp        m         14        0       14    90
#>  7 Afghan~ AF    AFG    2003 sp        m         14        0       14   127
#>  8 Afghan~ AF    AFG    2004 sp        m         14        0       14   139
#>  9 Afghan~ AF    AFG    2005 sp        m         14        0       14   151
#> 10 Afghan~ AF    AFG    2006 sp        m         14        0       14   193
#> # ... with 76,036 more rows, and 2 more variables: ymin.num <dbl>,
#> #   ymax.num <dbl>

Note that tidyr::gather requires two post-processing steps, which cause the same two types of repetition as tidyr::pivot_longer:

The reshape2 package suffers from the same two issues:

reshape2.result <- reshape2:::melt.data.frame(
  who,
  measure.vars=grep(who.pattern, names(who)),
  na.rm=TRUE,
  value.name="count")

Interestingly, data.table::patterns can be used to avoid repeating the data set name, who. However it supports neither type conversion nor regex capture groups.

dt.result <- data.table::melt.data.table(
  data.table(who),
  measure.vars=patterns(who.pattern),
  na.rm=TRUE,
  value.name="count")

Neither cdata nor stats provide an na.rm option:

who.df <- data.frame(who)
is.varying <- grepl(who.pattern, names(who))
names(who.df)[is.varying] <- paste0("count.", names(who)[is.varying])
stats.result <- stats::reshape(
  who.df,
  direction="long",
  timevar="variable",
  varying=is.varying)
if(requireNamespace("cdata")){
  cdata.result <- cdata::rowrecs_to_blocks(
    who, 
    cdata::build_unpivot_control(
      "variable",
      "count",
      grep(who.pattern, names(who), value=TRUE)),
    columnsToCopy=grep(who.pattern, names(who), value=TRUE, invert=TRUE))
}

Melting a wider iris back to original

## Example 1: melting a wider iris data back to original.
library(data.table)
iris.dt <- data.table(
  i=1:nrow(iris),
  iris[,1:4],
  Species=paste(iris$Species))
print(iris.dt)
#>        i Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
#>   1:   1          5.1         3.5          1.4         0.2    setosa
#>   2:   2          4.9         3.0          1.4         0.2    setosa
#>   3:   3          4.7         3.2          1.3         0.2    setosa
#>   4:   4          4.6         3.1          1.5         0.2    setosa
#>   5:   5          5.0         3.6          1.4         0.2    setosa
#>  ---                                                                
#> 146: 146          6.7         3.0          5.2         2.3 virginica
#> 147: 147          6.3         2.5          5.0         1.9 virginica
#> 148: 148          6.5         3.0          5.2         2.0 virginica
#> 149: 149          6.2         3.4          5.4         2.3 virginica
#> 150: 150          5.9         3.0          5.1         1.8 virginica

## what if we had two observations on each row?
set.seed(1)
iris.rand <- iris.dt[sample(.N)]
iris.wide <- cbind(treatment=iris.rand[1:75], control=iris.rand[76:150])
print(iris.wide, topn=2, nrows=10)
#>     treatment.i treatment.Sepal.Length treatment.Sepal.Width
#>  1:          68                    5.8                   2.7
#>  2:         129                    6.4                   2.8
#> ---                                                         
#> 74:          91                    5.5                   2.6
#> 75:          64                    6.1                   2.9
#>     treatment.Petal.Length treatment.Petal.Width treatment.Species control.i
#>  1:                    4.1                   1.0        versicolor        60
#>  2:                    5.6                   2.1         virginica       113
#> ---                                                                         
#> 74:                    4.4                   1.2        versicolor        57
#> 75:                    4.7                   1.4        versicolor        72
#>     control.Sepal.Length control.Sepal.Width control.Petal.Length
#>  1:                  5.2                 2.7                  3.9
#>  2:                  6.8                 3.0                  5.5
#> ---                                                              
#> 74:                  6.3                 3.3                  4.7
#> 75:                  6.1                 2.8                  4.0
#>     control.Petal.Width control.Species
#>  1:                 1.4      versicolor
#>  2:                 2.1       virginica
#> ---                                    
#> 74:                 1.6      versicolor
#> 75:                 1.3      versicolor

## This is the usual data.table syntax for getting the original iris back.
iris.melted <- melt(iris.wide, value.factor=TRUE, measure.vars = patterns(
  i="i$",
  Sepal.Length="Sepal.Length$",
  Sepal.Width="Sepal.Width$",
  Petal.Length="Petal.Length$",
  Petal.Width="Petal.Width$",
  Species="Species$"))
identical(iris.melted[order(i), names(iris.dt), with=FALSE], iris.dt)
#> [1] TRUE

## nc can do the same thing -- you must define an R argument named
## column, and another named argument which identifies each group.
(nc.melted <- nc::capture_melt_multiple(
  iris.wide,
  group="[^.]+",
  "[.]",
  column=".*"))
#>          group Petal.Length Petal.Width Sepal.Length Sepal.Width    Species   i
#>   1:   control          3.9         1.4          5.2         2.7 versicolor  60
#>   2:   control          5.5         2.1          6.8         3.0  virginica 113
#>   3:   control          5.6         1.4          6.1         2.6  virginica 135
#>   4:   control          1.5         0.1          4.9         3.1     setosa  10
#>   5:   control          1.4         0.2          5.1         3.5     setosa   1
#>  ---                                                                           
#> 146: treatment          1.6         0.2          4.8         3.1     setosa  31
#> 147: treatment          1.3         0.4          5.4         3.9     setosa  17
#> 148: treatment          5.4         2.1          6.9         3.1  virginica 140
#> 149: treatment          4.4         1.2          5.5         2.6 versicolor  91
#> 150: treatment          4.7         1.4          6.1         2.9 versicolor  64
identical(nc.melted[order(i), names(iris.dt), with=FALSE], iris.dt)
#> [1] TRUE

## This is how we do it using stats::reshape.
iris.wide.df <- data.frame(iris.wide)
names(iris.wide.df) <- sub("(.*?)[.](.*)", "\\2_\\1", names(iris.wide))
iris.reshaped <- stats::reshape(
  iris.wide.df,
  direction="long",
  timevar="group",
  varying=names(iris.wide.df),
  sep="_")
identical(data.table(iris.reshaped[, names(iris.dt)])[order(i)], iris.dt)
#> [1] TRUE

## get the parts columns and groups -- is there any difference
## between groups? of course not!
parts.wide <- nc::capture_melt_multiple(
  iris.wide,
  group=".*?",
  "[.]",
  column=".*?",
  "[.]",
  dim=".*")
if(require("ggplot2")){
  ggplot()+
    theme_bw()+
    theme(panel.spacing=grid::unit(0, "lines"))+
    facet_grid(dim ~ group)+
    coord_equal()+
    geom_abline(slope=1, intercept=0, color="grey")+
    geom_point(aes(
      Petal, Sepal),
      data=parts.wide)
}

plot of chunk unnamed-chunk-17

Lots of column types

## Example 2. Lots of column types, from example(melt.data.table).
DT <- data.table(
  i_1 = c(1:5, NA),
  i_2 = c(NA,6:10),
  f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
  f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
  c_1 = sample(c(letters[1:3], NA), 6, TRUE),
  d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
  d_2 = as.Date(6:1, origin="2012-01-01"))
## add a couple of list cols
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]

## original DT syntax is quite repetitive.
melt(DT, measure=patterns(
  i="^i",
  f="^f",
  d="^d",
  l="^l"
))
#>      c_1 variable  i    f          d              l
#>  1:    a        1  1    a 2013-09-02        1,1,1,1
#>  2:    a        1  2    a 2013-09-03            2,2
#>  3: <NA>        1  3 <NA> 2013-09-04              3
#>  4:    b        1  4    a       <NA>          4,4,4
#>  5:    b        1  5    c 2013-09-05              5
#>  6: <NA>        1 NA    c 2013-09-06             NA
#>  7:    a        2 NA    z 2012-01-07        a,a,a,a
#>  8:    a        2  6    a 2012-01-06      a,a,a,a,a
#>  9: <NA>        2  7    x 2012-01-05             NA
#> 10:    b        2  8    c 2012-01-04          b,b,b
#> 11:    b        2  9    x 2012-01-03        b,b,b,b
#> 12: <NA>        2 10    x 2012-01-02 NA,NA,NA,NA,NA

## nc syntax uses a single regex rather than four.
nc::capture_melt_multiple(
  DT,
  column="^[^c]",
  "_",
  number="[12]")
#>      c_1 number          d    f  i              l
#>  1:    a      1 2013-09-02    a  1        1,1,1,1
#>  2:    a      1 2013-09-03    a  2            2,2
#>  3: <NA>      1 2013-09-04 <NA>  3              3
#>  4:    b      1       <NA>    a  4          4,4,4
#>  5:    b      1 2013-09-05    c  5              5
#>  6: <NA>      1 2013-09-06    c NA             NA
#>  7:    a      2 2012-01-07    z NA        a,a,a,a
#>  8:    a      2 2012-01-06    a  6      a,a,a,a,a
#>  9: <NA>      2 2012-01-05    x  7             NA
#> 10:    b      2 2012-01-04    c  8          b,b,b
#> 11:    b      2 2012-01-03    x  9        b,b,b,b
#> 12: <NA>      2 2012-01-02    x 10 NA,NA,NA,NA,NA

## id.vars can be specified using original DT syntax.
melt(DT, id=1:2, measure=patterns(
  f="^f",
  l="^l"
))
#>     i_1 i_2 variable    f              l
#>  1:   1  NA        1    a        1,1,1,1
#>  2:   2   6        1    a            2,2
#>  3:   3   7        1 <NA>              3
#>  4:   4   8        1    a          4,4,4
#>  5:   5   9        1    c              5
#>  6:  NA  10        1    c             NA
#>  7:   1  NA        2    z        a,a,a,a
#>  8:   2   6        2    a      a,a,a,a,a
#>  9:   3   7        2    x             NA
#> 10:   4   8        2    c          b,b,b
#> 11:   5   9        2    x        b,b,b,b
#> 12:  NA  10        2    x NA,NA,NA,NA,NA

nc::capture_melt_multiple(
  DT,
  column="^[fl]",
  "_",
  number="[12]")
#>     i_1 i_2  c_1        d_1        d_2 number    f              l
#>  1:   1  NA    a 2013-09-02 2012-01-07      1    a        1,1,1,1
#>  2:   2   6    a 2013-09-03 2012-01-06      1    a            2,2
#>  3:   3   7 <NA> 2013-09-04 2012-01-05      1 <NA>              3
#>  4:   4   8    b       <NA> 2012-01-04      1    a          4,4,4
#>  5:   5   9    b 2013-09-05 2012-01-03      1    c              5
#>  6:  NA  10 <NA> 2013-09-06 2012-01-02      1    c             NA
#>  7:   1  NA    a 2013-09-02 2012-01-07      2    z        a,a,a,a
#>  8:   2   6    a 2013-09-03 2012-01-06      2    a      a,a,a,a,a
#>  9:   3   7 <NA> 2013-09-04 2012-01-05      2    x             NA
#> 10:   4   8    b       <NA> 2012-01-04      2    c          b,b,b
#> 11:   5   9    b 2013-09-05 2012-01-03      2    x        b,b,b,b
#> 12:  NA  10 <NA> 2013-09-06 2012-01-02      2    x NA,NA,NA,NA,NA

## reshape does not support list columns.
reshape(
  DT,
  varying=grep("^[fid]", names(DT)),
  sep="_",
  direction="long",
  timevar="number")
#>      c_1     l_1            l_2 number  i    f          d id
#>  1:    a 1,1,1,1        a,a,a,a      1  1    a 2013-09-02  1
#>  2:    a     2,2      a,a,a,a,a      1  2    a 2013-09-03  2
#>  3: <NA>       3             NA      1  3 <NA> 2013-09-04  3
#>  4:    b   4,4,4          b,b,b      1  4    a       <NA>  4
#>  5:    b       5        b,b,b,b      1  5    c 2013-09-05  5
#>  6: <NA>      NA NA,NA,NA,NA,NA      1 NA    c 2013-09-06  6
#>  7:    a 1,1,1,1        a,a,a,a      2 NA    z 2012-01-07  1
#>  8:    a     2,2      a,a,a,a,a      2  6    a 2012-01-06  2
#>  9: <NA>       3             NA      2  7    x 2012-01-05  3
#> 10:    b   4,4,4          b,b,b      2  8    c 2012-01-04  4
#> 11:    b       5        b,b,b,b      2  9    x 2012-01-03  5
#> 12: <NA>      NA NA,NA,NA,NA,NA      2 10    x 2012-01-02  6

## tidyr does, but errors for combining ordered and un-ordered factors.
tidyr::pivot_longer(
  DT, grep("[cf]", names(DT), invert=TRUE),
  names_pattern="(.)_(.)",
  names_to=c(".value", "number"))
#> # A tibble: 12 x 7
#>    f_1   f_2   c_1   number     i d          l        
#>    <fct> <ord> <chr> <chr>  <int> <date>     <list>   
#>  1 a     z     a     1          1 2013-09-02 <int [4]>
#>  2 a     z     a     2         NA 2012-01-07 <chr [4]>
#>  3 a     a     a     1          2 2013-09-03 <int [2]>
#>  4 a     a     a     2          6 2012-01-06 <chr [5]>
#>  5 <NA>  x     <NA>  1          3 2013-09-04 <int [1]>
#>  6 <NA>  x     <NA>  2          7 2012-01-05 <chr [1]>
#>  7 a     c     b     1          4 NA         <int [3]>
#>  8 a     c     b     2          8 2012-01-04 <chr [3]>
#>  9 c     x     b     1          5 2013-09-05 <int [1]>
#> 10 c     x     b     2          9 2012-01-03 <chr [4]>
#> 11 c     x     <NA>  1         NA 2013-09-06 <int [1]>
#> 12 c     x     <NA>  2         10 2012-01-02 <chr [5]>

family data

## Example 3, three children, one family per row, from data.table
## vignette.
family.dt <- fread(text="
family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
1         30 1998-11-26 2000-01-29         NA             1             2            NA
2         27 1996-06-22         NA         NA             2            NA            NA
3         26 2002-07-11 2004-04-05 2007-09-02             2             2             1
4         32 2004-10-10 2009-08-27 2012-07-21             1             1             1
5         29 2000-12-05 2005-02-28         NA             2             1            NA")
(children.melt <- melt(family.dt, measure = patterns(
  dob="^dob", gender="^gender"
), na.rm=TRUE, variable.factor=FALSE))
#>     family_id age_mother variable        dob gender
#>  1:         1         30        1 1998-11-26      1
#>  2:         2         27        1 1996-06-22      2
#>  3:         3         26        1 2002-07-11      2
#>  4:         4         32        1 2004-10-10      1
#>  5:         5         29        1 2000-12-05      2
#>  6:         1         30        2 2000-01-29      2
#>  7:         3         26        2 2004-04-05      2
#>  8:         4         32        2 2009-08-27      1
#>  9:         5         29        2 2005-02-28      1
#> 10:         3         26        3 2007-09-02      1
#> 11:         4         32        3 2012-07-21      1

## nc::field can be used to define group name and pattern at the
## same time, to avoid repetitive code.
(children.nc <- nc::capture_melt_multiple(
  family.dt,
  column="[^_]+",
  "_",
  nc::field("child", "", "[1-3]"),
  na.rm=TRUE))
#>     family_id age_mother child        dob gender
#>  1:         1         30     1 1998-11-26      1
#>  2:         2         27     1 1996-06-22      2
#>  3:         3         26     1 2002-07-11      2
#>  4:         4         32     1 2004-10-10      1
#>  5:         5         29     1 2000-12-05      2
#>  6:         1         30     2 2000-01-29      2
#>  7:         3         26     2 2004-04-05      2
#>  8:         4         32     2 2009-08-27      1
#>  9:         5         29     2 2005-02-28      1
#> 10:         3         26     3 2007-09-02      1
#> 11:         4         32     3 2012-07-21      1

## reshape works too.
reshape(
  family.dt,
  varying=grep("child", names(family.dt)),
  direction="long",
  sep="_",
  timevar="child.str")
#>     family_id age_mother child.str        dob gender id
#>  1:         1         30    child1 1998-11-26      1  1
#>  2:         2         27    child1 1996-06-22      2  2
#>  3:         3         26    child1 2002-07-11      2  3
#>  4:         4         32    child1 2004-10-10      1  4
#>  5:         5         29    child1 2000-12-05      2  5
#>  6:         1         30    child2 2000-01-29      2  1
#>  7:         2         27    child2       <NA>     NA  2
#>  8:         3         26    child2 2004-04-05      2  3
#>  9:         4         32    child2 2009-08-27      1  4
#> 10:         5         29    child2 2005-02-28      1  5
#> 11:         1         30    child3       <NA>     NA  1
#> 12:         2         27    child3       <NA>     NA  2
#> 13:         3         26    child3 2007-09-02      1  3
#> 14:         4         32    child3 2012-07-21      1  4
#> 15:         5         29    child3       <NA>     NA  5

histogram of iris

## Comparison with base R. 1. mfrow means parts on rows, mfcol means
## parts on columns. 2. same number of lines of code. 3. nc/ggplot2
## code has more names and fewer numbers.
imat <- as.matrix(iris[, 1:4])
ylim <- range(table(imat))
xlim <- range(imat)
par(mfcol=c(2,2), mar=c(2,2,1,1))
for(col.i in 1:ncol(imat)){
  hist(
    imat[, col.i],
    breaks=seq(xlim[1], xlim[2], by=0.1),
    ylim=ylim,
    main=colnames(imat)[col.i])
}

plot of chunk unnamed-chunk-20

single output column simple type conversion

All four packages below can convert the input reshape column name to a numeric output capture column.

pen.peaks.wide <- data.table::data.table(
  data.set=c("foo", "bar"),
  "10.1"=c(5L, 10L),
  "0.3"=c(26L, 39L))
pen.peaks.gather <- tidyr::gather(
  pen.peaks.wide,
  "penalty",
  "peaks",
  -1,
  convert=TRUE)
str(pen.peaks.gather)
#> 'data.frame':    4 obs. of  3 variables:
#>  $ data.set: chr  "foo" "bar" "foo" "bar"
#>  $ penalty : num  10.1 10.1 0.3 0.3
#>  $ peaks   : int  5 10 26 39

pen.peaks.nc <- nc::capture_melt_single(
  pen.peaks.wide,
  penalty="^[0-9.]+", as.numeric,
  value.name="peaks")
str(pen.peaks.nc)
#> Classes 'data.table' and 'data.frame':   4 obs. of  3 variables:
#>  $ data.set: chr  "foo" "bar" "foo" "bar"
#>  $ penalty : num  10.1 10.1 0.3 0.3
#>  $ peaks   : int  5 10 26 39
#>  - attr(*, ".internal.selfref")=<externalptr>

pen.peaks.pivot <- tidyr::pivot_longer(
  pen.peaks.wide,
  -1,
  names_to="penalty",
  names_ptypes=list(penalty=numeric()),
  values_to="peaks")
str(pen.peaks.pivot)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    4 obs. of  3 variables:
#>  $ data.set: chr  "foo" "foo" "bar" "bar"
#>  $ penalty : num  10.1 0.3 10.1 0.3
#>  $ peaks   : int  5 26 10 39

varying <- 2:3
pen.peaks.reshape.times <- stats::reshape(
  pen.peaks.wide,
  direction="long",
  varying=varying,
  times=as.numeric(names(pen.peaks.wide)[varying]),
  v.names="peaks",
  timevar="penalty")
str(pen.peaks.reshape.times)
#> Classes 'data.table' and 'data.frame':   4 obs. of  4 variables:
#>  $ data.set: chr  "foo" "bar" "foo" "bar"
#>  $ penalty : num  10.1 10.1 0.3 0.3
#>  $ peaks   : int  5 10 26 39
#>  $ id      : int  1 2 1 2
#>  - attr(*, ".internal.selfref")=<externalptr> 
#>  - attr(*, "reshapeLong")=List of 4
#>   ..$ varying:List of 1
#>   .. ..$ peaks: chr  "10.1" "0.3"
#>   .. ..- attr(*, "v.names")= chr "peaks"
#>   .. ..- attr(*, "times")= num  10.1 0.3
#>   ..$ v.names: chr "peaks"
#>   ..$ idvar  : chr "id"
#>   ..$ timevar: chr "penalty"

pen.peaks.renamed <- pen.peaks.wide
names(pen.peaks.renamed) <- paste0(ifelse(
  grepl("^[0-9]", names(pen.peaks.wide)),
  "peaks_", ""),
  names(pen.peaks.wide))
pen.peaks.reshape.sep <- stats::reshape(
  pen.peaks.renamed,
  direction="long",
  varying=varying,
  sep="_",
  timevar="penalty")
str(pen.peaks.reshape.sep)
#> Classes 'data.table' and 'data.frame':   4 obs. of  4 variables:
#>  $ data.set: chr  "foo" "bar" "foo" "bar"
#>  $ penalty : num  10.1 10.1 0.3 0.3
#>  $ peaks   : int  5 10 26 39
#>  $ id      : int  1 2 1 2
#>  - attr(*, ".internal.selfref")=<externalptr> 
#>  - attr(*, "reshapeLong")=List of 4
#>   ..$ varying:List of 1
#>   .. ..$ peaks: chr  "peaks_10.1" "peaks_0.3"
#>   .. ..- attr(*, "v.names")= chr "peaks"
#>   .. ..- attr(*, "times")= num  10.1 0.3
#>   ..$ v.names: chr "peaks"
#>   ..$ idvar  : chr "id"
#>   ..$ timevar: chr "penalty"