Using lists to run multiple statistical tests with one set of code

Question

I'm interested in using lists to run multiple statistical tests with one set of code.

For example, I want to run glm() models that vary in terms of DVs, IVs, data, and family, based on the rows of a data frame / list. I can do this the long way, and I can use lapply() to do this a "medium way" in which only the DV used in the model changes. But I would like to know if there is a method (preferably using lapply()) to complete this task with less code and in a more automated/iterative fashion.

For the example data, I created 2 datasets using the ggplot2::diamonds data and the code below:

### for dataset with top 300 rows
# ---- NOTE: keeps the rows holding the 300 largest values of the `table` column
# ----       (dplyr::top_n() ranks by `table` and keeps ties, so the result can
# ----       contain more than 300 rows -- confirm with nrow() below)
# ---- NOTE: `diamonds` is referenced unqualified; presumably ggplot2::diamonds,
# ----       so ggplot2 has to be attached before this runs
diamonds_top300 <- data.frame(dplyr::top_n(diamonds, 300, table))
# ---- NOTE: quick inspection: first rows, structure, column names, row count
head(diamonds_top300)
str(diamonds_top300)
colnames(diamonds_top300)
nrow(diamonds_top300)
# ---- NOTE: lists the distinct values of each column that may be used as an
# ----       outcome (DV) or predictor (IV) in the models
unique(diamonds_top300$price)
unique(diamonds_top300$y)
unique(diamonds_top300$cut)
unique(diamonds_top300$color)
unique(diamonds_top300$carat)
unique(diamonds_top300$clarity)
unique(diamonds_top300$depth)
unique(diamonds_top300$table)

### for dataset with bottom 300 rows
# ---- NOTE: a negative n makes dplyr::top_n() keep the rows with the 300
# ----       smallest values of the `table` column (ties are kept, so the result
# ----       can contain more than 300 rows -- confirm with nrow() below)
# ---- NOTE: `diamonds` is referenced unqualified; presumably ggplot2::diamonds,
# ----       so ggplot2 has to be attached before this runs
diamonds_bottom300 <- data.frame(dplyr::top_n(diamonds, -300, table))
# ---- NOTE: quick inspection: first rows, structure, column names, row count
head(diamonds_bottom300)
str(diamonds_bottom300)
colnames(diamonds_bottom300)
nrow(diamonds_bottom300)
# ---- NOTE: lists the distinct values of each column that may be used as an
# ----       outcome (DV) or predictor (IV) in the models
unique(diamonds_bottom300$price)
unique(diamonds_bottom300$y)
unique(diamonds_bottom300$cut)
unique(diamonds_bottom300$color)
unique(diamonds_bottom300$carat)
unique(diamonds_bottom300$clarity)
unique(diamonds_bottom300$depth)
unique(diamonds_bottom300$table)

I then created a data frame in which each row holds the settings for one model (DV, DV label, dataset name, IVs, and family); it prints as follows:

## creates df with variable info
# ---- NOTE: one row per model; columns hold the DV name, a display label for
# ----       the DV, the name of the dataset object, the right-hand side of the
# ----       formula, and the name of the glm family
# ---- NOTE: built directly with data.frame() (no cbind() character matrix) and
# ----       stringsAsFactors = FALSE, so every column is a plain character
# ----       vector on every R version
model_variable_df <-
  data.frame(
    DV_name = c("carat", "depth", "price"),
    DV_label = c("carat size", "depth size", "diamond price"),
    dataset_name = c("diamonds_bottom300", "diamonds_bottom300", "diamonds_top300"),
    IV_name = c("x + y + color", "x + y + clarity", "x + z + color"),
    family = c("poisson", "poisson", "gaussian"),
    stringsAsFactors = FALSE
  )

> model_variable_df
  DV_name      DV_label       dataset_name         IV_name   family
1   carat    carat size diamonds_bottom300   x + y + color  poisson
2   depth    depth size diamonds_bottom300 x + y + clarity  poisson
3   price diamond price    diamonds_top300   x + z + color gaussian

I can accomplish my task using the long method:

## long form of 3 models
# ---- NOTE: every model is spelled out by hand; only the formula, the dataset
# ----       and the family differ between the three glm() calls

### model 1: carat on x + y + color, poisson family, bottom-300 data
freq_glm_poisson_carat <- glm(
  formula = carat ~ x + y + color,
  family = poisson(),
  data = diamonds_bottom300
)

### model 2: depth on x + y + clarity, poisson family, bottom-300 data
# ---- NOTE(review): the object name says "binomial" but the family is
# ----               poisson(); name kept unchanged so existing references work
freq_glm_binomial_depth <- glm(
  formula = depth ~ x + y + clarity,
  family = poisson(),
  data = diamonds_bottom300
)

### model 3: price on x + z + color, gaussian family, top-300 data
freq_glm_gaussian_price <- glm(
  formula = price ~ x + z + color,
  family = gaussian(),
  data = diamonds_top300
)

I can also use the medium method for more specific and limited DV_name based tasks.

## model that uses lapply, and only varies the DV
# ---- NOTE: DV_name is the only thing that changes between fits
# ---- NOTE: the right-hand side is fixed at x + y + color (same as model 1)
# ---- NOTE: data = diamonds_top300 only
# ---- NOTE: family = poisson() only
# ---- NOTE: wrapr::let() replaces the placeholder names DV_col / dataset_obj
# ----       with the real names before glm() runs; the result is a list with
# ----       one fitted model per DV
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <-
  lapply(
    model_variable_df$DV_name,
    function(dv_name) {
      wrapr::let(
        c(DV_col = dv_name, dataset_obj = "diamonds_top300"),
        glm(DV_col ~ x + y + color, data = dataset_obj, family = poisson())
      )
    }
  )
# ---- NOTE: names each list element "<prefix>__<DV name>"
names(freq_checking_mlm_poisson_Effects_x_y_color_1z_model) <-
  paste(
    "freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter",
    model_variable_df$DV_name,
    sep = "__"
  )
# ---- NOTE: copies every named list element into the global environment as its
# ----       own object
list2env(
  freq_checking_mlm_poisson_Effects_x_y_color_1z_model,
  envir = .GlobalEnv
)
# ---- NOTE: lists the objects whose names contain the prefix
apropos("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter")

Is there any method I could use to complete this task using (1) less code and (2) more iteration/automation? Any and all help is much appreciated.

FYI, I use RStudio on a 2013 Intel Macbook Pro.

Thanks.

Code used for practice:

# sets up data

## Loads packages
# ---- NOTE: each package is installed only when it is missing and is then
# ----       attached with library(); require() alone would leave a freshly
# ----       installed package unattached, so the first run would fail later
# ---- NOTE: making plots and diamonds dataset
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# ---- NOTE: run mixed effects models
if (!requireNamespace("lme4", quietly = TRUE)) {
  install.packages("lme4")
}
library(lme4)
# ---- NOTE: for data wrangling
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# ---- NOTE: for iteration
if (!requireNamespace("wrapr", quietly = TRUE)) {
  install.packages("wrapr")
}
library(wrapr)

## dataset creation

### for dataset with top 300 rows
# ---- NOTE: keeps the rows holding the 300 largest values of the `table` column
# ----       (dplyr::top_n() ranks by `table` and keeps ties, so the result can
# ----       contain more than 300 rows -- confirm with nrow() below)
diamonds_top300 <- data.frame(dplyr::top_n(diamonds, 300, table))
# ---- NOTE: quick inspection: first rows, structure, column names, row count
head(diamonds_top300)
str(diamonds_top300)
colnames(diamonds_top300)
nrow(diamonds_top300)
# ---- NOTE: lists the distinct values of each column that may be used as an
# ----       outcome (DV) or predictor (IV) in the models
unique(diamonds_top300$price)
unique(diamonds_top300$y)
unique(diamonds_top300$cut)
unique(diamonds_top300$color)
unique(diamonds_top300$carat)
unique(diamonds_top300$clarity)
unique(diamonds_top300$depth)
unique(diamonds_top300$table)

### for dataset with bottom 300 rows
# ---- NOTE: a negative n makes dplyr::top_n() keep the rows with the 300
# ----       smallest values of the `table` column (ties are kept, so the result
# ----       can contain more than 300 rows -- confirm with nrow() below)
diamonds_bottom300 <- data.frame(dplyr::top_n(diamonds, -300, table))
# ---- NOTE: quick inspection: first rows, structure, column names, row count
head(diamonds_bottom300)
str(diamonds_bottom300)
colnames(diamonds_bottom300)
nrow(diamonds_bottom300)
# ---- NOTE: lists the distinct values of each column that may be used as an
# ----       outcome (DV) or predictor (IV) in the models
unique(diamonds_bottom300$price)
unique(diamonds_bottom300$y)
unique(diamonds_bottom300$cut)
unique(diamonds_bottom300$color)
unique(diamonds_bottom300$carat)
unique(diamonds_bottom300$clarity)
unique(diamonds_bottom300$depth)
unique(diamonds_bottom300$table)

## creates df with variable info
# ---- NOTE: one row per model; columns hold the DV name, a display label for
# ----       the DV, the name of the dataset object, the right-hand side of the
# ----       formula, and the name of the glm family
# ---- NOTE: built directly with data.frame() (no cbind() character matrix) and
# ----       stringsAsFactors = FALSE, so every column is a plain character
# ----       vector on every R version
model_variable_df <-
  data.frame(
    DV_name = c("carat", "depth", "price"),
    DV_label = c("carat size", "depth size", "diamond price"),
    dataset_name = c("diamonds_bottom300", "diamonds_bottom300", "diamonds_top300"),
    IV_name = c("x + y + color", "x + y + clarity", "x + z + color"),
    family = c("poisson", "poisson", "gaussian"),
    stringsAsFactors = FALSE
  )

## long form of 3 models
# ---- NOTE: every model is spelled out by hand; only the formula, the dataset
# ----       and the family differ between the three glm() calls

### model 1: carat on x + y + color, poisson family, bottom-300 data
freq_glm_poisson_carat <- glm(
  formula = carat ~ x + y + color,
  family = poisson(),
  data = diamonds_bottom300
)

### model 2: depth on x + y + clarity, poisson family, bottom-300 data
# ---- NOTE(review): the object name says "binomial" but the family is
# ----               poisson(); name kept unchanged so existing references work
freq_glm_binomial_depth <- glm(
  formula = depth ~ x + y + clarity,
  family = poisson(),
  data = diamonds_bottom300
)

### model 3: price on x + z + color, gaussian family, top-300 data
freq_glm_gaussian_price <- glm(
  formula = price ~ x + z + color,
  family = gaussian(),
  data = diamonds_top300
)

## model that uses lapply, and only varies the DV
# ---- NOTE: DV_name is the only thing that changes between fits
# ---- NOTE: the right-hand side is fixed at x + y + color (same as model 1)
# ---- NOTE: data = diamonds_top300 only
# ---- NOTE: family = poisson() only
# ---- NOTE: wrapr::let() replaces the placeholder names DV_col / dataset_obj
# ----       with the real names before glm() runs; the result is a list with
# ----       one fitted model per DV
freq_checking_mlm_poisson_Effects_x_y_color_1z_model <-
  lapply(
    model_variable_df$DV_name,
    function(dv_name) {
      wrapr::let(
        c(DV_col = dv_name, dataset_obj = "diamonds_top300"),
        glm(DV_col ~ x + y + color, data = dataset_obj, family = poisson())
      )
    }
  )
# ---- NOTE: names each list element "<prefix>__<DV name>"
names(freq_checking_mlm_poisson_Effects_x_y_color_1z_model) <-
  paste(
    "freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter",
    model_variable_df$DV_name,
    sep = "__"
  )
# ---- NOTE: copies every named list element into the global environment as its
# ----       own object
list2env(
  freq_checking_mlm_poisson_Effects_x_y_color_1z_model,
  envir = .GlobalEnv
)
# ---- NOTE: lists the objects whose names contain the prefix
apropos("freq_checking_mlm_poisson_Effects_x_y_color_1z_model_contracts_filter")

Or use Map (they are all loops): Map(function(x, y, d, fam) glm(reformulate(x, y), data = get(d), family = match.fun(fam)), x = model_variable_df$IV_name, y = model_variable_df$DV_name, d = model_variable_df$dataset_name, fam = model_variable_df$family) — rawr, Commented Apr 28, 2021 at 5:17

Ronak Shah · Accepted Answer · 2021-04-28 05:10:28Z

0

You can do this with lapply as :

# Fits one glm() per row of model_variable_df and collects the fits in a list.
# seq_len() is used instead of seq() so that a zero-row data frame produces an
# empty list rather than iterating over c(1, 0).
model_list <- lapply(seq_len(nrow(model_variable_df)), function(i) {
  # settings (DV, IVs, dataset name, family) for the i-th model
  val <- model_variable_df[i, ]
  glm(
    # as.formula() turns the pasted "DV~IVs" string into a formula
    as.formula(paste(val$DV_name, val$IV_name, sep = "~")),
    # get() looks up the dataset object from its name
    data = get(val$dataset_name),
    # glm() accepts the family as a character string naming a family function
    family = val$family
  )
})

as.formula() converts the pasted string into a formula, and get() retrieves the dataset object whose name is stored as a string.

answered Apr 28, 2021 at 5:10

Ronak Shah

390k20 gold badges170 silver badges234 bronze badges

Add a comment |

Collectives™ on Stack Overflow

Using lists to run multiple statistics test with one set of code

1 Answer 1

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Related