Fitting linear and non-linear equations by group

We’ll fit some linear and non-linear models for dominant height, and compare them. We’ll use the first 10 strata of the exemple dataset exfm16.

library(forestmangr)
library(dplyr)
library(tidyr)

data(exfm14)
data_ex <- exfm14 %>% filter(strata%in%1:10)
data_ex
#> # A tibble: 485 x 4
#>   strata  plot   age    dh
#>    <dbl> <dbl> <dbl> <dbl>
#> 1      1     2    30  12.3
#> 2      1     2    42  16.3
#> 3      1     2    54  17.6
#> 4      1     2    66  18.9
#> 5      1     2    78  21.0
#> 6      1     2    90  20.5
#> # ... with 479 more rows

In order to fit Schumacher’s dominant height model, we can use lm_table. Thanks to the log and inv functions, there is no need to create new variables:

mod1 <- lm_table(data_ex, log(dh) ~ inv(age))
mod1
#>         b0        b1      Rsqr  Rsqr_adj Std.Error
#> 1 3.484289 -24.84688 0.5774814 0.5766066 0.1489379

To fit a non-linear model, like Chapman-Richards’ we can use the nls_table function. This function uses Levenberg-Marquardt’s algorithm by default, in order to assure a good fit. Since this is a non-linear fit, we have to input initial values for all coefficients:

mod2 <- nls_table(data_ex, dh ~ b0 * (1 - exp( -b1 * age )  )^b2, 
          mod_start = c( b0=23, b1=0.03, b2 = 1.3  ) )
mod2
#>         b0         b1       b2
#> 1 25.07289 0.04864685 2.210243

If we wanted to fit one model of each stratum, we can use the .groups argument:

mod1 <- lm_table(data_ex, log(dh) ~ inv(age), .groups = "strata")
mod1
#>    strata       b0        b1      Rsqr  Rsqr_adj Std.Error
#> 1       1 3.409597 -23.89531 0.6735579 0.6626765 0.1231461
#> 2       2 3.438958 -20.93546 0.5838795 0.5748334 0.1161602
#> 3       3 3.466038 -24.71091 0.5240282 0.5145088 0.1672240
#> 4       4 3.503344 -23.68113 0.6594713 0.6511657 0.1146404
#> 5       5 3.536465 -26.23788 0.5750804 0.5692596 0.1571615
#> 6       6 3.541486 -26.18396 0.5843789 0.5758968 0.1589094
#> 7       7 3.546067 -28.33781 0.7170698 0.7125789 0.1331317
#> 8       8 3.385322 -22.29504 0.5114795 0.4966758 0.1641533
#> 9       9 3.444338 -23.34891 0.5151849 0.5062068 0.1638949
#> 10     10 3.411819 -24.00458 0.5749233 0.5585742 0.1491196
mod2 <- nls_table(data_ex, dh ~ b0 * (1 - exp( -b1 * age )  )^b2, 
          mod_start = c( b0=23, b1=0.03, b2 = 1.3  ),
          .groups = "strata" )
mod2
#>    strata       b0         b1        b2
#> 1       1 22.38251 0.06883817 3.9495071
#> 2       2 23.61424 0.08097176 5.4171092
#> 3       3 24.09040 0.05813164 2.9428225
#> 4       4 29.73188 0.02172249 0.8875606
#> 5       5 26.74705 0.04163860 1.8799458
#> 6       6 25.63739 0.05955898 3.4005108
#> 7       7 26.68246 0.03878189 1.8419905
#> 8       8 23.35938 0.05159001 2.2394412
#> 9       9 27.31603 0.02544091 0.9842925
#> 10     10 22.95016 0.06214439 3.3861913

If the fit is not ideal, it’s possible to use a dataframe with starting values for each stratum, and use it as an input for mod_start:

tab_start <- data.frame(strata = c(1:10), 
              rbind(
              data.frame(b0=rep(23, 5),b1=rep(0.03,5),b2=rep(1.3,5) ), 
              data.frame(b0=rep(23, 5),b1=rep(0.03,5),b2=rep(.5,5) )))
tab_start
#>    strata b0   b1  b2
#> 1       1 23 0.03 1.3
#> 2       2 23 0.03 1.3
#> 3       3 23 0.03 1.3
#> 4       4 23 0.03 1.3
#> 5       5 23 0.03 1.3
#> 6       6 23 0.03 0.5
#> 7       7 23 0.03 0.5
#> 8       8 23 0.03 0.5
#> 9       9 23 0.03 0.5
#> 10     10 23 0.03 0.5
mod2 <- nls_table(data_ex, dh ~ b0 * (1 - exp( -b1 * age )  )^b2, 
          mod_start = tab_start,
          .groups = "strata" )
mod2
#>    strata       b0         b1        b2
#> 1       1 22.38251 0.06883817 3.9495071
#> 2       2 23.61424 0.08097176 5.4171092
#> 3       3 24.09040 0.05813164 2.9428225
#> 4       4 29.73188 0.02172249 0.8875606
#> 5       5 26.74705 0.04163860 1.8799458
#> 6       6 25.63740 0.05955884 3.4004949
#> 7       7 26.68250 0.03878163 1.8419741
#> 8       8 23.35905 0.05159620 2.2398949
#> 9       9 27.31667 0.02543790 0.9841984
#> 10     10 22.95016 0.06214456 3.3862119

Now we’re going to fit some other models. These are:

Schumacher: \[ Ln(DH) = \beta_0 + \beta_1 * \frac{1}{age} \]

Chapman-Richards: \[ DH = \beta_0 * (1 - exp^{-\beta_1 * age})^{\beta_2} \]

Bayley-Clutter: \[ Ln(DH) = \beta_0 + \beta_1 * \begin{pmatrix} \frac{1}{age} \end{pmatrix} ^{\beta_2} \]

Curtis: \[ DH = \beta_0 + \beta_1 * \frac{1}{age} \]

We’ll fit these models and add their estimated values to the original data using the merge_est output and naming each estimated variable with the est_name argument:

data_ex_est <- data_ex %>% 
  lm_table(log(dh) ~ inv(age), .groups = "strata",
           output = "merge_est", est.name = "Schumacher") %>% 
  
  nls_table(dh ~ b0 * (1 - exp( -b1 * age )  )^b2, 
          mod_start = c( b0=23, b1=0.03, b2 = 1.3  ),.groups="strata",
          output ="merge_est",est.name="Chapman-Richards") %>% 
  
  nls_table(log(dh) ~ b0 + b1 * ( inv(age)^b2 ) , 
          mod_start = c( b0=3, b1=-130, b2 = 1.5),.groups = "strata",
          output ="merge_est",est.name = "Bailey-Clutter") %>% 
  
  lm_table(dh ~ inv(age), .groups = "strata",
           output = "merge_est", est.name = "Curtis") 

head(data_ex_est)  
#>   strata plot age       dh Schumacher Chapman-Richards Bailey-Clutter
#> 1      1    2  30 12.29841   13.64110         13.10199       12.58661
#> 2      1    2  42 16.32000   17.12709         17.86289       18.23334
#> 3      1    2  54 17.58000   19.43531         20.31013       20.32246
#> 4      1    2  66 18.94000   21.06362         21.45677       21.20276
#> 5      1    2  78 21.04000   22.27015         21.97365       21.62661
#> 6      1    2  90 20.48000   23.19865         22.20283       21.85316
#>     Curtis
#> 1 13.61765
#> 2 17.58237
#> 3 19.78499
#> 4 21.18665
#> 5 22.15704
#> 6 22.86866

Ps: The lm_table function checks if the model has log in the y variable, and if it does, it removes it automatically when estimating variables. Because of that, there’s no need to calculate the exponential for the estimated variables.

In order to compare these models, we’ll calculate the root mean square error and bias for all models. To do this, we’ll gather all estimated variables in a single column using tidyr::gather, group by model, and use the rmse_per and bias_per functions:

data_ex_est %>% 
  gather(Model, Value, 
         Schumacher, `Chapman-Richards`, `Bailey-Clutter`, Curtis) %>% 
  group_by(Model) %>% 
  summarise(
    RMSE = rmse_per(y = dh, yhat = Value),
    BIAS = bias_per(y = dh, yhat = Value) )
#> # A tibble: 4 x 3
#>   Model             RMSE      BIAS
#>   <chr>            <dbl>     <dbl>
#> 1 Bailey-Clutter    14.8  1.03e+ 0
#> 2 Chapman-Richards  14.7 -7.37e- 4
#> 3 Curtis            14.8  2.94e-15
#> 4 Schumacher        15.0  1.03e+ 0

Another way of comparing and evaluating these models is using residual graphical analysis. The function resid_plot can help us with that:

resid_plot(data_ex_est, "dh", "Schumacher", "Chapman-Richards", "Bailey-Clutter", "Curtis")

There are other types of plots avaiable, such as histogram:

resid_plot(data_ex_est, "dh", "Schumacher","Chapman-Richards","Bailey-Clutter", "Curtis",
           type = "histogram_curve")

And versus:

resid_plot(data_ex_est, "dh", "Schumacher", "Chapman-Richards", "Bailey-Clutter", "Curtis",
           type = "versus")