# load various data sets
data(penguins)
smoke_complete <- read_csv(here::here("part8", "data", "clinical.csv"))
lusc_data <- read_csv(here::here("part8", "data", "tumor", "LUSC.csv"))
meso_data <- read_csv(here::here("part8", "data", "tumor", "MESO.csv"))
prad_data <- read_csv(here::here("part8", "data", "tumor", "PRAD.csv"))
read_data <- read_csv(here::here("part8", "data", "tumor", "READ.csv"))
8: Functions and Lists
BSTA 526: R Programming for Health Data Science
1 Welcome to R Programming: Part 8!
Today we will learn how to write our own custom functions and how to work with lists.
Before you get started:
Remember to save this notebook under a new name, such as
part_08_b526_YOURNAME.qmd.
- Load the packages in the setup code chunk
1.1 Learning Objectives
- Learn how to write and use custom functions
- Learn about lists as a general purpose data structure
- Learn and utilize list properties
- Access list elements using
$and[[]] - Understand the difference between homogeneous and heterogeneous lists
- Use
purrr::pluck()to access list elements - Understand how
data.frames()are list-like
1.2 Load data sets
2 Motivation
Today: building blocks for truly automating your data wrangling or analysis work.
This is the true power of learning how to program!
2.1 Custom Functions & Lists
At first these two sections will seem disjointed:
- Functions
- Create your own functions
- Use for data wrangling or analyses or visualization, anything
- Lists
- An object type in R that can be very useful
- Spoiler alert: data frames can actually behave like a
list
- Next week we will use them together with “Functional Programming Tools”
- such as
purrr::map()type functions to iterate a function - over a list or over rows of a data frame, for example.
- such as
2.2 Custom Functions
Please read the Functions chapter in R4DS for some background! (Chapter 25)
We know what a function is in R, for instance, these are “built-in” functions in R
mean(1:10)[1] 5.5
toupper("abc")[1] "ABC"
But what if we want to create our own custom functions?
We have actually done this before! When we used across():
penguins %>% summarize(across(.cols = ends_with("mm"),
.fns = ~ mean(.x, na.rm = TRUE))) # this is a custom function, in a way# A tibble: 1 × 3
bill_length_mm bill_depth_mm flipper_length_mm
<dbl> <dbl> <dbl>
1 43.9 17.2 201.
2.3 Motivating Example
- Here is another example for motivating today’s work.
- Let’s say we want some summary measures from our data,
- specifically the means of days to death, age at diagnosis, and
- the correlation of those two variables.
lusc_data %>% summarize(
mean_cig = mean(days_to_death, na.rm = TRUE),
mean_age = mean(age_at_diagnosis, na.rm = TRUE),
cor_cig_age = cor(days_to_death, age_at_diagnosis,
use = "complete.obs"))# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 872. 24754. -0.0299
- Now, we have read in multiple data sets, and want to calculate this for each data set.
- We can do this separately, with copy and paste and
- hope we don’t make a mistake, and
- also keep track of all our outputs somehow:
lusc_data %>% summarize(
mean_cig = mean(days_to_death, na.rm = TRUE),
mean_age = mean(age_at_diagnosis, na.rm = TRUE),
cor_cig_age = cor(days_to_death, age_at_diagnosis,
use = "complete.obs"))# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 872. 24754. -0.0299
meso_data %>% summarize(
mean_cig = mean(days_to_death, na.rm = TRUE),
mean_age = mean(age_at_diagnosis, na.rm = TRUE),
cor_cig_age = cor(days_to_death, age_at_diagnosis,
use = "complete.obs"))# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 584. 23188. -0.206
prad_data %>% summarize(
mean_cig = mean(days_to_death, na.rm = TRUE),
mean_age = mean(age_at_diagnosis, na.rm = TRUE),
cor_cig_age = cor(days_to_death, age_at_diagnosis,
use = "complete.obs"))# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 1579 22480. 0.152
read_data %>% summarize(
mean_cig = mean(days_to_death, na.rm = TRUE),
mean_age = mean(age_at_diagnosis, na.rm = TRUE),
cor_cig_age = cor(days_to_death, age_at_diagnosis,
use = "complete.obs"))# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 786. 23705. -0.288
- Or, we can create our own function to do this!
- We will go through the parts of a function below,
- but this is what it would look like:
# custom function: summarize survival time and age for one tumor data set
# Input:  mydata - a data frame containing the numeric columns
#         `days_to_death` and `age_at_diagnosis`
# Output: a summary data frame (one row for an ungrouped input) with
#   mean_cig    - mean of days_to_death, NAs removed (despite the "cig" in
#                 its name, this value is computed from days_to_death)
#   mean_age    - mean of age_at_diagnosis, NAs removed
#   cor_cig_age - correlation of days_to_death and age_at_diagnosis,
#                 using complete observations only
myfun <- function(mydata) {
  summarize(
    mydata,
    mean_cig = mean(days_to_death, na.rm = TRUE),
    mean_age = mean(age_at_diagnosis, na.rm = TRUE),
    cor_cig_age = cor(
      days_to_death,
      age_at_diagnosis,
      use = "complete.obs"
    )
  )
}
# apply custom function to each of the datasets
# (note that the output is not being saved below)
myfun(lusc_data)# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 872. 24754. -0.0299
myfun(meso_data)# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 584. 23188. -0.206
myfun(prad_data)# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 1579 22480. 0.152
myfun(read_data)# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 786. 23705. -0.288
- Later we will see that we can make a list of data frames, and
- use
purrr::map()to iterate over this list and - obtain all of the outputs at once.
- use
- This is a motivating example, it will take some time to get to this point!
list_of_data <- list("LUSC" = lusc_data,
"MESO" = meso_data,
"PRAD" = prad_data,
"READ" = read_data)# returns a list of outputs
map(list_of_data, myfun)$LUSC
# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 872. 24754. -0.0299
$MESO
# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 584. 23188. -0.206
$PRAD
# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 1579 22480. 0.152
$READ
# A tibble: 1 × 3
mean_cig mean_age cor_cig_age
<dbl> <dbl> <dbl>
1 786. 23705. -0.288
# returns a data frame of outputs
map_df(list_of_data, myfun, .id = "cancer_type")# A tibble: 4 × 4
cancer_type mean_cig mean_age cor_cig_age
<chr> <dbl> <dbl> <dbl>
1 LUSC 872. 24754. -0.0299
2 MESO 584. 23188. -0.206
3 PRAD 1579 22480. 0.152
4 READ 786. 23705. -0.288
We’ll start with learning about how to create our own functions.
3 Functions: Don’t Repeat Yourself (DRY)
3.1 Motivation
- If you do something in R once and have to do it again,
- it’s probably best to think of how to do it well once,
- rather than cutting and pasting again.
- Part of doing this is by using and writing functions.
- We can take code that we might have to write 10 times in a row, and
- package it in a way that we can use it multiple times.
- For the example above,
- we have some computations we want to perform on specific variables in a data set, and
- we have multiple data sets with those variables we need to do this with.
- A function can be used to perform these computations.
3.2 Advantages over using copy-and-paste
From R for Data Science:
Writing a function has four big advantages over using copy-and-paste:
- You can give a function an evocative name that makes your code easier to understand.
- As requirements change, you only need to update code in one place, instead of many.
- You eliminate the chance of making incidental mistakes when you copy and paste (i.e. updating a variable name in one place, but not in another).
- It makes it easier to reuse work from project-to-project, increasing your productivity over time.
3.3 Creating a function
Let’s learn about how we define a function.
f <- function(<arguments>) {
## Do something interesting or useful
## Code
## Return an output
}Things to ask yourself:
- What do you want your function to do?
- What will the output of the function be?
- What kind of object do you want the output to be?
3.4 Example of creating a function
# Square a number.
# Input:  number - a numeric value (or numeric vector; `*` works elementwise)
# Output: number multiplied by itself
square_number <- function(number) {
# compute the square and keep it in a local object
output_number <- number * number
# explicitly hand the result back to the caller
return(output_number)
}
square_number(4)[1] 16
- We define our function by
- giving it a name,
square_number, and - defining its arguments within the parentheses of
function().
- giving it a name,
- Our example function only has one argument,
number:
square_number <- function(number) {
- What about the
{?- It defines the beginning of a code block -
- a code block defines the code that will run when we use the function.
- It defines the beginning of a code block -
- Between the
{....}we calculate the square of our number and- assign it to
output_number.
- assign it to
- The next line uses
return()to return a value from our function. - Then we close the code block with a
}.
- The return value of a function is the last expression in the function body to be evaluated.
- i.e.
return()should appear in the last line of the function.
- i.e.
- Note that you don’t have to use
return()- but it makes things clearer once your functions get complicated, and
- especially if you are returning multiple outputs.
- The function below, written without `return()`, does the same as the one above.
If you don’t specify the return object with return(), R will assume you meant to return the last line of code inside the function that returns a value.
# Square a number without calling return().
# The value of the last evaluated expression is what the function returns.
square_number_again <- function(number) {
# last (and only) expression: its value is the return value
number * number
}
square_number_again(4)[1] 16
- However, be careful! Notice that the code below doesn’t work.
- Why? Because the last line is an assignment:
- it saves an object and returns its value invisibly, so nothing is printed.
- This is why it’s best to just use
return()- so you can be sure you’re returning what you want.
# Deliberately "broken" version: the last expression is an assignment.
# An assignment returns its value invisibly, so calling this function
# prints nothing at the console.
square_number_again <- function(number) {
# the square is saved in a local object; nothing is printed
mynumsq <- number * number
}
square_number_again(4)
# note the lack of output when running this function!3.5 Mini-challenge (on your own)
Make a function called
cube_numberthat returns the cube of a number.Test it out and make sure it works.
Hint: you can find the cube of a number using
num^3.
cube_number <- ___3.6 Arguments
- We also can input multiple arguments to a function, which can be of different types:
abs_fun <- function(x, i) {
res <- abs(x - mean(x)) ^ i
return(res)
}- What is the output of this function?
- What type of data object is the output?
abs_fun(x = 1:10, i = 4) [1] 410.0625 150.0625 39.0625 5.0625 0.0625 0.0625 5.0625 39.0625
[9] 150.0625 410.0625
mypet <- function(name, pet) {
mystring <- paste(name,"'s ", pet, sep = "")
mystring <- toupper(mystring)
return(mystring)
}- You can name arguments, or use them in order, just like built-in R functions
# not naming arguments and hoping they are ordered correctly:
mypet("meike", "cat")[1] "MEIKE'S CAT"
# oops
mypet("cat", "meike")[1] "CAT'S MEIKE"
# naming arguments:
mypet(name = "meike",
pet = "cat")[1] "MEIKE'S CAT"
# naming arguments not in order:
mypet(pet = "cat",
name = "meike")[1] "MEIKE'S CAT"
- What if we only give one argument?
mypet("meike")- We can also set default arguments.
- If we assign a value to an argument inside function(),
- that value becomes the default, and the argument no longer needs to be specified when calling the function.
# Build an upper-case possessive label such as "MEIKE'S CAT".
# Input:  name - owner's name
#         pet  - kind of pet; defaults to "Mouse" when not supplied
# Output: the string "<name>'s <pet>" converted to upper case
mypet <- function(name, pet = "Mouse") {
  # glue the pieces together with no separator between them
  label <- paste0(name, "'s ", pet)
  return(toupper(label))
}
mypet("meike", "cat")[1] "MEIKE'S CAT"
mypet("meike")[1] "MEIKE'S MOUSE"
3.7 Save the output of the function
- We can save the output of a function just as we do with built-in functions:
petstring <- mypet(name = "meike", pet = "cat")
petstring[1] "MEIKE'S CAT"
- We can also manipulate the output, since it is an object:
tolower(petstring)[1] "meike's cat"
str_replace_all(string = petstring,
pattern = "CAT",
replacement = "DOG")[1] "MEIKE'S DOG"
3.8 Challenge 1 (10 min)
Create a function that calculates the mean of a numeric vector, removing
NA’s. Make sure to test that it works. What happens when you input a character vector?Create a function that takes a data frame and calculates the mean of all numeric variables using your custom mean function above. Try this with the penguins data set.
3.9 What does the function name return?
- Type/run
corin the console (no parentheses).- What do you see?
- Also try a name of a function you created, such as
square_number.
# note: what you see in the console is different than in the html output
square_number<srcref: file "" chars 1:18 to 4:1>
3.10 Example: standardizing values in a vector
- Below we create our own function to standardize values in a vector:
- subtract the mean of the values and
- divide by the standard deviation of the values
# Standardize a numeric vector (z-scores).
# Input:  vec - a numeric vector; missing values are ignored when
#         computing the mean and standard deviation
# Output: a numeric vector of the same length as vec
standardize_fun <- function(vec) {
  # distance of every value from the mean
  centered <- vec - mean(vec, na.rm = TRUE)
  # spread of the values
  spread <- sd(vec, na.rm = TRUE)
  return(centered / spread)
}
# test 1
tmpvec <- 1:10
standardize_fun(tmpvec) [1] -1.4863011 -1.1560120 -0.8257228 -0.4954337 -0.1651446 0.1651446
[7] 0.4954337 0.8257228 1.1560120 1.4863011
# test 2
tmpvec <- c(-3, -2, -1, 0, 1, 2, 3)
standardize_fun(tmpvec)[1] -1.3887301 -0.9258201 -0.4629100 0.0000000 0.4629100 0.9258201 1.3887301
- Usually we create functions to do more complex tasks than simple number or text manipulation.
- Often the input argument is a data frame or tibble.
3.11 Example & Mini challenge: counting categories in a variable from a data frame
- What are the input and output of the function below?
- What requirements do we need for the input data frame?
count_categories <- function(df){
counts <- df %>%
count(disease)
return(counts)
}- Apply the function to different data sets:
# these data are loaded in the first section above
count_categories(smoke_complete)# A tibble: 15 × 2
disease n
<chr> <int>
1 BLCA 412
2 BRCA 1098
3 CESC 307
4 COAD 461
5 GBM 617
6 LGG 516
7 LUSC 1008
8 MESO 87
9 PRAD 500
10 READ 172
11 SKCM 470
12 STAD 443
13 THYM 124
14 UCEC 560
15 UCS 57
count_categories(lusc_data)# A tibble: 1 × 2
disease n
<chr> <int>
1 LUSC 504
count_categories(meso_data)# A tibble: 1 × 2
disease n
<chr> <int>
1 MESO 87
- Why does this not work?
count_categories(penguins)- This function might be useful if you needed to summarize the diseases across a large number of files.
- One thing you might notice is that
diseaseis “hard coded”- which means it always counts the column with that exact name.
- There is a way to make this more flexible
- (i.e. take an argument that specifies the column name)
- using something called
tidyeval, which we will learn about next time.
3.12 Example: read in data from a file path
- If you find yourself reading in many data sets the same way,
- you might create your own custom function to do this the way you like.
# custom function to read in a file (specified with argument path)
# output is a data frame
load_files <- function(path){
out_frame <- readxl::read_excel(path, na="NA", sheet=1)
out_frame <- janitor::clean_names(out_frame) %>%
mutate(path_name = path)
return(out_frame)
}- Run the function:
smoke1 <- load_files(path = here::here("part8", "data", "smoke_1.xlsx"))
glimpse(smoke1) # notice last columnRows: 549
Columns: 6
$ primary_diagnosis <chr> "C34.1", "C34.1", "C34.3", "C34.1", "C34.1", "C34.3"…
$ tumor_stage <chr> "stage ia", "stage ib", "stage ib", "stage ia", "sta…
$ age_at_diagnosis <dbl> 24477, 26615, 28171, 27154, 23370, 19025, 26938, 284…
$ vital_status <chr> "dead", "dead", "dead", "alive", "alive", "dead", "d…
$ morphology <chr> "8070/3", "8070/3", "8070/3", "8083/3", "8070/3", "8…
$ path_name <chr> "/Users/niederha/Library/CloudStorage/OneDrive-Orego…
- You don’t have to include “
path =”
smoke2 <- load_files(here::here("part8", "data", "smoke_2.xlsx"))
glimpse(smoke2) # notice last columnRows: 603
Columns: 6
$ primary_diagnosis <chr> "C67.9", "C67.9", "C67.9", "C67.9", "C67.9", "C67.9"…
$ tumor_stage <chr> "stage iv", "stage ii", "stage iii", "stage iv", "st…
$ age_at_diagnosis <dbl> 17682, 19776, 23631, 26546, 24534, 25134, 26527, 243…
$ vital_status <chr> "alive", "alive", "alive", "dead", "dead", "dead", "…
$ morphology <chr> "8120/3", "8120/3", "8120/3", "8120/3", "8120/3", "8…
$ path_name <chr> "/Users/niederha/Library/CloudStorage/OneDrive-Orego…
3.13 Using custom functions within mutate() and across()
- We can use custom functions within
mutate():
# standardize_fun is a function we created above
# using inside mutate without across:
smoke1 %>%
mutate(age_std = standardize_fun(age_at_diagnosis)) %>%
# selecting just a few columns to see new column:
select(primary_diagnosis, age_at_diagnosis, age_std)# A tibble: 549 × 3
primary_diagnosis age_at_diagnosis age_std
<chr> <dbl> <dbl>
1 C34.1 24477 -0.0933
2 C34.1 26615 0.574
3 C34.3 28171 1.06
4 C34.1 27154 0.743
5 C34.1 23370 -0.439
6 C34.3 19025 -1.80
7 C34.3 26938 0.675
8 C34.1 28430 1.14
9 C34.1 30435 1.77
10 C34.9 24019 -0.236
# ℹ 539 more rows
- When using
across()within themutate(), we can use the custom function - Note that the code below replaces values in all numeric columns with standardized transformations of the values
smoke1 %>%
mutate(across(.cols = where(is.numeric),
.fns = standardize_fun))# A tibble: 549 × 6
primary_diagnosis tumor_stage age_at_diagnosis vital_status morphology
<chr> <chr> <dbl> <chr> <chr>
1 C34.1 stage ia -0.0933 dead 8070/3
2 C34.1 stage ib 0.574 dead 8070/3
3 C34.3 stage ib 1.06 dead 8070/3
4 C34.1 stage ia 0.743 alive 8083/3
5 C34.1 stage iiia -0.439 alive 8070/3
6 C34.3 stage ib -1.80 dead 8070/3
7 C34.3 stage iv 0.675 dead 8070/3
8 C34.1 stage ib 1.14 dead 8070/3
9 C34.1 stage iib 1.77 dead 8070/3
10 C34.9 stage iv -0.236 dead 8070/3
# ℹ 539 more rows
# ℹ 1 more variable: path_name <chr>
# we don't need the .fns = ~ standardize_fun(.x)
# because it is a name of a function and we aren't changing arguments3.14 Creating a function inside across()
- We have seen examples where we created a function inside
across() - This is a bit messier but does the same thing
- We have to use
.xas our function argument placeholder/name.
smoke1 %>%
mutate(across(.cols = where(is.numeric),
.fns = ~ (.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)
))# A tibble: 549 × 6
primary_diagnosis tumor_stage age_at_diagnosis vital_status morphology
<chr> <chr> <dbl> <chr> <chr>
1 C34.1 stage ia -0.0933 dead 8070/3
2 C34.1 stage ib 0.574 dead 8070/3
3 C34.3 stage ib 1.06 dead 8070/3
4 C34.1 stage ia 0.743 alive 8083/3
5 C34.1 stage iiia -0.439 alive 8070/3
6 C34.3 stage ib -1.80 dead 8070/3
7 C34.3 stage iv 0.675 dead 8070/3
8 C34.1 stage ib 1.14 dead 8070/3
9 C34.1 stage iib 1.77 dead 8070/3
10 C34.9 stage iv -0.236 dead 8070/3
# ℹ 539 more rows
# ℹ 1 more variable: path_name <chr>
- You might think of the function like the one created in the code below,
- which is the same as what we specified above,
- just with
.xinstead ofvec, and noreturn()
standardize_fun <- function(.x) {
(.x - mean(.x, na.rm = TRUE)) / sd(.x, na.rm = TRUE)
}3.15 Using custom functions within summarize()
- Below we create a function and use it within
summarize().- Input: a numeric vector (such as a numeric column of a dataset)
- Output: it divides the mean of the vector by the standard deviation of the vector
- This custom function is a “summarizing” function that returns one value,
- not a vector of multiple values.
mean_over_sd <- function(vec) {
mean(vec, na.rm = TRUE) / sd(vec, na.rm = TRUE)
}- Use the custom function within
summarize():
penguins %>%
group_by(species) %>%
summarize(std = mean_over_sd(bill_length_mm))# A tibble: 3 × 2
species std
<fct> <dbl>
1 Adelie 14.6
2 Chinstrap 14.6
3 Gentoo 15.4
- Use the custom function within
summarize()andacross():
penguins %>%
group_by(species) %>%
summarize(across(
.cols = where(is.numeric),
.fns = mean_over_sd))# A tibble: 3 × 6
species bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
<fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Adelie 14.6 15.1 29.0 8.07 2443.
2 Chinstrap 14.6 16.2 27.5 9.71 2326.
3 Gentoo 15.4 15.3 33.5 10.1 2535.
3.16 Challenge 2 (15 min)
Use the above functions
standardize_fun()andmean_over_sd()with the LUSC data to standardize the numeric vectorsage_at_diagnosisandcigarettes_per_day. Calculate mean/sd for those vectors as well.Create a custom function that converts mm to cm, and apply the function within a
mutate()andacross()on all the columns inpenguinsdata that end with “mm”.
3.17 Checking for user errors with if()
- What happens when someone tries
square_number("two")? - Since “two” is a character, you can’t multiply it.
# run this yourself, eval: false so we can knit
square_number("two")- We can let the user know that this isn’t what the function is expecting for its input.
- We can use
if(){ }as a way to check if the input valuenumberis actually numeric. if()statements check the condition within the parentheses,- and then run the code in the
{ }(curly brackets) if the condition is true.
- and then run the code in the
# run this yourself, eval: false so we can knit
# Square a number, with a check on the input type.
# Input:  number - expected to be numeric
# Output: number multiplied by itself
# Error:  stops with "Your input was not a number" for non-numeric input
square_number_improved <- function(number) {
  # guard clause: if the input is not numeric, stop() raises the error
  # and exits the function before any arithmetic is attempted
  if (!is.numeric(number)) {
    stop("Your input was not a number")
  }
  # the input is numeric here, so it can be multiplied safely
  return(number * number)
}
square_number_improved(2)
square_number_improved("two")- It depends on who uses your functions.
- If the functions are just for you and your lab, as long as everyone understands what goes into the functions, you probably don’t need to anticipate all kinds of errors, just a few major ones.
- It’s when you are releasing your software as a freely available package that you should think about doing extensive testing for errors, especially the functions you anticipate will be used by most people who use your software.
3.18 Challenge 3 (10 minutes)
- Create a custom function that does the same data cleaning steps as the code below for one dataset, so that you don’t repeat the same code 3 times. Put your solution in the code chunk below this, labeled
challenge1solns. - Your custom function should essentially be a copy of the code that does these steps:
- Loads the time point data from an .xlsx file,
- cleans the data,
- renames any necessary columns, and
- then pivots the data into a long format.
- The function should have two arguments:
- the name of the excel file and
- the name of the sheet
- The function should output a long dataset
- Bind the data at the end of the code chunk after running the function three times.
- If you have time, add in an error check that when the sheet number is not between 2 and 4, returns an error message to say “Incorrect sheet value for time point data.”
# Load data
mouse_tp1 <- read_excel(here::here("part8", "data", "mouse_biomarker.xlsx"),
sheet = 2,
na = c("","/"),
n_max = 34, # CAREFUL WITH THIS
.name_repair = janitor::make_clean_names) %>%
remove_empty(which = c("rows","cols"))
mouse_tp2 <- read_excel(here::here("part8", "data", "mouse_biomarker.xlsx"),
sheet = 3,
na = c("","/"),
.name_repair = janitor::make_clean_names) %>%
remove_empty(which = c("rows","cols"))
mouse_tp3 <- read_excel(here::here("part8", "data", "mouse_biomarker.xlsx"),
sheet = 4,
na = c("","/"),
.name_repair = janitor::make_clean_names) %>%
remove_empty(which = c("rows","cols"))
# rename columns (do this to all data, just in case)
colnames(mouse_tp2) <- str_replace(colnames(mouse_tp2), "_ng", "_pg")
mouse_tp1 <- mouse_tp1 %>%
rename(mirna1 = mirna_1,
mirna2 = mi_rna_2,
preference_obj1 = preference,
preference_obj2 = x_4) %>%
filter(!is.na(sid)) %>%
mutate(
preference_obj1 = as.numeric(preference_obj1),
preference_obj2 = as.numeric(preference_obj2)
)
mouse_tp2 <- mouse_tp2 %>%
rename(mirna1 = mirna_1,
mirna2 = mi_rna_2,
preference_obj1 = preference,
preference_obj2 = x_4) %>%
filter(!is.na(sid)) %>%
mutate(
preference_obj1 = as.numeric(preference_obj1),
preference_obj2 = as.numeric(preference_obj2)
)
mouse_tp3 <- mouse_tp3 %>%
rename(mirna1 = mirna_1,
mirna2 = mi_rna_2,
preference_obj1 = preference,
preference_obj2 = x_4) %>%
filter(!is.na(sid)) %>%
mutate(
preference_obj1 = as.numeric(preference_obj1),
preference_obj2 = as.numeric(preference_obj2)
)
# Pivot data
mouse_tp1 <- mouse_tp1 %>%
pivot_longer(cols = starts_with("normalized"),
names_to = "biomarker_type",
values_to = "biomarker_value")
mouse_tp2 <- mouse_tp2 %>%
pivot_longer(cols = starts_with("normalized"),
names_to = "biomarker_type",
values_to = "biomarker_value")
mouse_tp3 <- mouse_tp3 %>%
pivot_longer(cols = starts_with("normalized"),
names_to = "biomarker_type",
values_to = "biomarker_value")
# Note: the code below is not a part of your custom function
# Bind data
mouse_tp <- bind_rows("tp1" = mouse_tp1,
"tp2" = mouse_tp2,
"tp3" = mouse_tp3,
.id = "time")# create your function here
# implement your function 3 times
# bind the outputted data frames4 Lists
4.1 Lists, what are they?
- Lists are a general purpose data structure in R.
- They are very flexible, in that they have slots.
- Unlike vectors, a
listcan be heterogeneous.- Each slot can contain elements of different types, such as vectors, matrices, data frames, or even other lists!
- Lists allow you to organize and store diverse types of data in a single object.
4.2 List basics
- We’ll start by making a list and exploring its different properties.
my_list <- list(cat_names = c("Morris", "Julia"),
hedgehog_names = "Spiny",
dog_names = c("Rover", "Spot"))
my_list$cat_names
[1] "Morris" "Julia"
$hedgehog_names
[1] "Spiny"
$dog_names
[1] "Rover" "Spot"
- Lists have a
length():
length(my_list)[1] 3
# note that there are no dimensions though:
dim(my_list)NULL
glimpse()a list
glimpse(my_list)List of 3
$ cat_names : chr [1:2] "Morris" "Julia"
$ hedgehog_names: chr "Spiny"
$ dog_names : chr [1:2] "Rover" "Spot"
- If the list’s elements are named, you can ask for their names:
names(my_list)[1] "cat_names" "hedgehog_names" "dog_names"
- We can access a single element in the list using
$and its name,- similar to how we access data frame columns:
names(my_list)[1] "cat_names" "hedgehog_names" "dog_names"
my_list$cat_names[1] "Morris" "Julia"
my_list$hedgehog_names[1] "Spiny"
my_list$dog_names[1] "Rover" "Spot"
- We can also access an element using the
[[]](double brackets) accessor
my_list[[1]][1] "Morris" "Julia"
my_list[[2]][1] "Spiny"
my_list[[3]][1] "Rover" "Spot"
# what happens when you run this?
# my_list[[4]]- If a list’s slots are named, they can be accessed with their name:
my_list[["hedgehog_names"]][1] "Spiny"
# what is the type/structure of the result?
str(my_list[["hedgehog_names"]]) chr "Spiny"
str(my_list[["dog_names"]]) chr [1:2] "Rover" "Spot"
4.3 Difference between [] and [[]]
- Beware of the difference between
[]and[[]]. - The
[]accessor returns a list of length 1, not the element in it:
my_list <- list(cat_names = c("Morris", "Julia"),
hedgehog_names = "Spiny",
dog_names = c("Rover", "Spot"))
my_list["hedgehog_names"]$hedgehog_names
[1] "Spiny"
str(my_list["hedgehog_names"])List of 1
$ hedgehog_names: chr "Spiny"
- Compare to
[[]]:
my_list[["hedgehog_names"]][1] "Spiny"
In most cases, you want to use [[]].
4.4 Vectors vs. lists, and [] vs. [[]]
- Vectors
vec[1:3]returns a vector of length 3vec[1]returns a vector of length 1[[]]cannot be used with vectors
my_vec <- c("Morris", "Julia", "Spiny", "Rover", "Spot")
my_vec[1:3][1] "Morris" "Julia" "Spiny"
my_vec[1][1] "Morris"
- Lists
list[1:3]returns a list of length 3list[1]returns a list of length 1list[[1]]returns the first element of a list- this could be a number (vector of length 1), a vector, another list, a data frame, a ggplot, whatever!
my_list[1:3]$cat_names
[1] "Morris" "Julia"
$hedgehog_names
[1] "Spiny"
$dog_names
[1] "Rover" "Spot"
my_list[1]$cat_names
[1] "Morris" "Julia"
my_list[[1]][1] "Morris" "Julia"
From the help manual of [[:
“The most important distinction between
[,[[and$is that the[can select more than one element whereas the other two select a single element.”
4.5 What does my_list[[1]][1] do?
my_list[[1]][1][1] "Morris"
- Let’s break it up step-by-step to better understand what it does:
# save the first element of the list as tmpout
tmpout <- my_list[[1]]
str(tmpout) chr [1:2] "Morris" "Julia"
# now take the first element of tmpout
tmpout[1][1] "Morris"
- The first part,
my_list[[1]], returns the first element ofmy_list- which is a vector of length two (
c("Morris", "Julia"))
- which is a vector of length two (
- The second part,
[1], returns the first element of that vector “Morris”:
str(my_list[[1]][1]) chr "Morris"
- If you try to pull out an element that doesn’t exist, you get a strange variety of results, so be careful!!
# run in R on your computer to see the errors
my_list[[1]][3]
my_list[[6]]
my_list[["dogg_names"]] # check your spelling!!!4.6 Mini-challenge (your turn)
- Find the length of the
cat_nameselement inmy_list. Then, find the length ofhedgehog_names.
my_list <- list(cat_names = c("Morris", "Julia"),
hedgehog_names = "Spiny",
dog_names = c("Rover", "Spot"))4.7 purrr::pluck()
- The
purrrpackage has an additional way to extract list elements:purrr::pluck(). - This is equivalent to using
[[]], but always returnsNULLwhen the element does not exist. pluck()can be a handy way to extract something from a list without dealing with the[[]]and[]confusion.
my_list <- list(cat_names = c("Morris", "Julia"),
hedgehog_names = "Spiny",
dog_names = c("Rover", "Spot"))
my_list %>%
pluck("cat_names")[1] "Morris" "Julia"
# go deeper in a list
my_list %>%
pluck("cat_names", 2)[1] "Julia"
pluck()outputs a vector, just likemy_list[["cat_names"]]:
my_list %>%
pluck("cat_names") %>%
str() chr [1:2] "Morris" "Julia"
pluck()outputs NULL instead of an error:
my_list %>%
pluck("dogggg_names")NULL
my_list %>%
pluck(10)NULL
4.8 Mini-challenge (your turn)
- Run the code below. What does it output?
Did it output a data.frame, or a vector?
penguins %>%
pluck("species") [1] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[8] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[15] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[22] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[29] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[36] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[43] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[50] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[57] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[64] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[71] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[78] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[85] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[92] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[99] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[106] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[113] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[120] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[127] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[134] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[141] Adelie Adelie Adelie Adelie Adelie Adelie Adelie
[148] Adelie Adelie Adelie Adelie Adelie Gentoo Gentoo
[155] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[162] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[169] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[176] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[183] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[190] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[197] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[204] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[211] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[218] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[225] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[232] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[239] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[246] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[253] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[260] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[267] Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
[274] Gentoo Gentoo Gentoo Chinstrap Chinstrap Chinstrap Chinstrap
[281] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[288] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[295] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[302] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[309] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[316] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[323] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[330] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[337] Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
[344] Chinstrap
Levels: Adelie Chinstrap Gentoo
4.9 Adding things to Lists
- The slots in a `list` are not fixed.
- We can take an already made list and add a new slot to it using `<-`:
my_list <- list(cat_names = c("Morris", "Julia"),
hedgehog_names = "Spiny",
dog_names = c("Rover", "Spot"))
my_list$raccoon_names <- c("Rocky")
my_list
$cat_names
[1] "Morris" "Julia"
$hedgehog_names
[1] "Spiny"
$dog_names
[1] "Rover" "Spot"
$raccoon_names
[1] "Rocky"
my_list$cat_ages <- c(2, 5)
my_list
$cat_names
[1] "Morris" "Julia"
$hedgehog_names
[1] "Spiny"
$dog_names
[1] "Rover" "Spot"
$raccoon_names
[1] "Rocky"
$cat_ages
[1] 2 5
This is a little bit like adding a column to a data frame (with `mutate()` or `add_column()` or even `$`).
We can just keep adding!
This also works numerically:
# another list, this time only partially named
my_list <- list(hello = c("hello", "world"),
c("a","b","c"),
stuff = "stuff")
# notice only two elements have names, this is allowed
my_list
$hello
[1] "hello" "world"
[[2]]
[1] "a" "b" "c"
$stuff
[1] "stuff"
names(my_list)[1] "hello" "" "stuff"
my_list[[4]] <- c("new","stuff")
my_list
$hello
[1] "hello" "world"
[[2]]
[1] "a" "b" "c"
$stuff
[1] "stuff"
[[4]]
[1] "new" "stuff"
my_list[[6]] <- "Does this work?"
# What happened to the fifth element?
my_list
$hello
[1] "hello" "world"
[[2]]
[1] "a" "b" "c"
$stuff
[1] "stuff"
[[4]]
[1] "new" "stuff"
[[5]]
NULL
[[6]]
[1] "Does this work?"
names(my_list)[1] "hello" "" "stuff" "" "" ""
4.10 Homogeneous versus heterogeneous lists
- When we automate a repetitive process, we usually assume that the lists that we create are homogeneous.
- That is, the data type of the list element is the same for each slot in the list.
- In our case, usually the data type will be a `data.frame`.
- However, lists can also be heterogeneous, as we can see in the list above when we added the `cat_ages` element.
- A common pattern is to return a heterogeneous list from a function.
- For example, the `lm()` function actually returns a list.
- Run a linear regression model with lm()
# outcome is on left side of ~
# predictor/independent variable/covariate is on right side of ~
# specify data in data argument
output <- lm(body_mass_g ~ bill_length_mm, data = penguins)
# summary() produces summaries about the model results
sum_output <- summary(output)
sum_output
Call:
lm(formula = body_mass_g ~ bill_length_mm, data = penguins)
Residuals:
Min 1Q Median 3Q Max
-1762.08 -446.98 32.59 462.31 1636.86
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.307 283.345 1.279 0.202
bill_length_mm 87.415 6.402 13.654 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 645.4 on 340 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.3542, Adjusted R-squared: 0.3523
F-statistic: 186.4 on 1 and 340 DF, p-value: < 2.2e-16
- The summary regression output is a list:
names(sum_output) [1] "call" "terms" "residuals" "coefficients"
[5] "aliased" "sigma" "df" "r.squared"
[9] "adj.r.squared" "fstatistic" "cov.unscaled" "na.action"
str(sum_output)List of 12
$ call : language lm(formula = body_mass_g ~ bill_length_mm, data = penguins)
$ terms :Classes 'terms', 'formula' language body_mass_g ~ bill_length_mm
.. ..- attr(*, "variables")= language list(body_mass_g, bill_length_mm)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "body_mass_g" "bill_length_mm"
.. .. .. ..$ : chr "bill_length_mm"
.. ..- attr(*, "term.labels")= chr "bill_length_mm"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(body_mass_g, bill_length_mm)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "body_mass_g" "bill_length_mm"
$ residuals : Named num [1:342] -30.2 -15.2 -635.1 -120.4 -147.7 ...
..- attr(*, "names")= chr [1:342] "1" "2" "3" "5" ...
$ coefficients : num [1:2, 1:4] 362.31 87.42 283.35 6.4 1.28 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "bill_length_mm"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:2] FALSE FALSE
..- attr(*, "names")= chr [1:2] "(Intercept)" "bill_length_mm"
$ sigma : num 645
$ df : int [1:3] 2 340 2
$ r.squared : num 0.354
$ adj.r.squared: num 0.352
$ fstatistic : Named num [1:3] 186 1 340
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:2, 1:2] 1.93e-01 -4.32e-03 -4.32e-03 9.84e-05
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "bill_length_mm"
.. ..$ : chr [1:2] "(Intercept)" "bill_length_mm"
$ na.action : 'omit' Named int [1:2] 4 272
..- attr(*, "names")= chr [1:2] "4" "272"
- attr(*, "class")= chr "summary.lm"
- There are different ways to access the coefficients element of the list:
sum_output$coefficients Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.30672 283.345233 1.278676 2.018833e-01
bill_length_mm 87.41528 6.401985 13.654401 3.808283e-34
sum_output[["coefficients"]] Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.30672 283.345233 1.278676 2.018833e-01
bill_length_mm 87.41528 6.401985 13.654401 3.808283e-34
sum_output %>% pluck(coefficients) Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.30672 283.345233 1.278676 2.018833e-01
bill_length_mm 87.41528 6.401985 13.654401 3.808283e-34
- Note this is a matrix, not a data frame!
str(sum_output[["coefficients"]]) num [1:2, 1:4] 362.31 87.42 283.35 6.4 1.28 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:2] "(Intercept)" "bill_length_mm"
..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
- What if we use `[]`?
# [] returns a list:
sum_output["coefficients"]
$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.30672 283.345233 1.278676 2.018833e-01
bill_length_mm 87.41528 6.401985 13.654401 3.808283e-34
str(sum_output["coefficients"])List of 1
$ coefficients: num [1:2, 1:4] 362.31 87.42 283.35 6.4 1.28 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "bill_length_mm"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
- Use `[]` to access multiple elements of the list:
sum_output[c("call","coefficients")]
$call
lm(formula = body_mass_g ~ bill_length_mm, data = penguins)
$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 362.30672 283.345233 1.278676 2.018833e-01
bill_length_mm 87.41528 6.401985 13.654401 3.808283e-34
# a list of length 2
str(sum_output[c("call","coefficients")])List of 2
$ call : language lm(formula = body_mass_g ~ bill_length_mm, data = penguins)
$ coefficients: num [1:2, 1:4] 362.31 87.42 283.35 6.4 1.28 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:2] "(Intercept)" "bill_length_mm"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
- Remember you cannot use `[[]]` to select more than one element; a vector of names is treated as nested (recursive) indexing, which returns `NULL` here:
sum_output[[c("call","coefficients")]]NULL
- This function runs a linear model using the penguins data and saves some of the regression output as a heterogeneous list
#' Regress body mass on bill length for the penguins of a single island.
#'
#' @param myisland Island name; rows of `penguins` are kept when their
#'   `island` value equals it (default "Torgersen").
#' @return A heterogeneous list with three elements: `coefficients` (the
#'   coefficient matrix from the model summary), `call` (the model call),
#'   and `island` (the value of `myisland`).
run_penguin_model <- function(myisland = "Torgersen") {
  # The object name `subset_penguins` is recorded verbatim in the model
  # call that gets returned, so it must not be renamed.
  subset_penguins <- filter(penguins, island == myisland)

  fit <- lm(body_mass_g ~ bill_length_mm, data = subset_penguins)
  fit_summary <- summary(fit)

  list(
    coefficients = fit_summary[["coefficients"]],
    call = fit_summary[["call"]],
    island = myisland
  )
}
output_model <- run_penguin_model("Torgersen")
output_model
$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1144.5433 734.4877 1.558288 0.125601038
bill_length_mm 65.7706 18.8012 3.498212 0.001006702
$call
lm(formula = body_mass_g ~ bill_length_mm, data = subset_penguins)
$island
[1] "Torgersen"
4.11 List of lists
- Lists can be heterogeneous, and since they can contain anything, they can also contain other lists!
my_info_list <- list(
name = "Patty",
pets = c("Cat", "Dog","Parrot","Kangaroo"),
address = list(number = 273,
street = "N Main St",
city = "Portland",
state = "Oregon")
)
my_info_list
$name
[1] "Patty"
$pets
[1] "Cat" "Dog" "Parrot" "Kangaroo"
$address
$address$number
[1] 273
$address$street
[1] "N Main St"
$address$city
[1] "Portland"
$address$state
[1] "Oregon"
- We can pull out elements from the list inside a list
my_info_list[["address"]][["street"]][1] "N Main St"
my_info_list %>% pluck("address", "street")[1] "N Main St"
4.12 View a list
- Try using `View()` with `my_list`, `my_info_list`, and `output` from the linear model above.
- RStudio tries to give you an interactive viewing experience for each element.
- While in the View screen, click on the icon at the very right of the screen. What does that do?
# run this in R
View(my_list)
View(my_info_list)
View(output)
4.13 data.frames are list-like
You can access columns in a `data.frame` using `$`.
This is because a `data.frame` is a special instance of a list.
Hence, you can use all of the above accessors to manipulate variables in a `data.frame`.
Load the `mtcars` data frame (it’s nice and small)
data(mtcars)
glimpse(mtcars)Rows: 32
Columns: 11
$ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
$ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
$ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
$ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
$ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
$ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
$ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
$ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
$ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
$ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
$ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…
- What is `length` doing here? How does it compare to `dim`?
length(mtcars)[1] 11
`mtcars` is also a kind of list - we can use `$`, `[[]]`, and `pluck()`:
mtcars$mpg [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
mtcars[[1]] [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
str(mtcars[[1]]) num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
mtcars[["mpg"]] [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
mtcars %>% pluck("mpg") [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
mtcars %>% pull("mpg") [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4
[16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7
[31] 15.0 21.4
- Compare the output above to that below when using a single
[]:
mtcars[1] mpg
Mazda RX4 21.0
Mazda RX4 Wag 21.0
Datsun 710 22.8
Hornet 4 Drive 21.4
Hornet Sportabout 18.7
Valiant 18.1
Duster 360 14.3
Merc 240D 24.4
Merc 230 22.8
Merc 280 19.2
Merc 280C 17.8
Merc 450SE 16.4
Merc 450SL 17.3
Merc 450SLC 15.2
Cadillac Fleetwood 10.4
Lincoln Continental 10.4
Chrysler Imperial 14.7
Fiat 128 32.4
Honda Civic 30.4
Toyota Corolla 33.9
Toyota Corona 21.5
Dodge Challenger 15.5
AMC Javelin 15.2
Camaro Z28 13.3
Pontiac Firebird 19.2
Fiat X1-9 27.3
Porsche 914-2 26.0
Lotus Europa 30.4
Ford Pantera L 15.8
Ferrari Dino 19.7
Maserati Bora 15.0
Volvo 142E 21.4
str(mtcars[1])'data.frame': 32 obs. of 1 variable:
$ mpg: num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
- A common pattern in base R for filtering that you should be aware of is using the `$` operator to filter on rows.
- The tidyverse simplifies this with `filter()`, and that’s what I encourage you to use,
- but this pattern is often used with objects from Bioconductor packages (or in other people’s code ;))
mtcars[mtcars$cyl == 8, ] mpg cyl disp hp drat wt qsec vs am gear carb
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
- This is the tidyverse version of the above “base-R” code
mtcars %>%
filter(cyl == 8) mpg cyl disp hp drat wt qsec vs am gear carb
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
4.14 Recap of lists
- These are some basic operations with lists in R.
- Lists are powerful for organizing and manipulating complex data structures in a flexible way.
- You can create a list using the `list()` function.
- Elements within the list can be of different types:
# Example list
my_list <- list(
numeric_vector = c(1, 2, 3),
character_vector = c("a", "b", "c"),
matrix = matrix(1:4, nrow = 2),
data_frame = data.frame(name = c("John", "Jane"), age = c(25, 30))
)
my_list
$numeric_vector
[1] 1 2 3
$character_vector
[1] "a" "b" "c"
$matrix
[,1] [,2]
[1,] 1 3
[2,] 2 4
$data_frame
name age
1 John 25
2 Jane 30
- You can access elements in a list using
  - double square brackets `[[]]`
  - or the dollar sign `$` for named elements:
# Accessing elements
(numeric_vector <- my_list[[1]])[1] 1 2 3
(character_vector <- my_list$character_vector)[1] "a" "b" "c"
- You can add elements to a list
  - using the `c()` function or
  - by assigning a new element to a specific index or
  - using the familiar `$`:
# Adding elements
my_list <- c(my_list, new_element = "This is a new element")
my_list[[6]] <- "Another new element"
my_list$new3 <- "Yet another new element"
my_list
$numeric_vector
[1] 1 2 3
$character_vector
[1] "a" "b" "c"
$matrix
[,1] [,2]
[1,] 1 3
[2,] 2 4
$data_frame
name age
1 John 25
2 Jane 30
$new_element
[1] "This is a new element"
[[6]]
[1] "Another new element"
$new3
[1] "Yet another new element"
- Lists can have attributes, such as names and dimensions.
- You can assign names to list elements:
names(my_list)[1] "numeric_vector" "character_vector" "matrix" "data_frame"
[5] "new_element" "" "new3"
# Assigning names to list elements
names(my_list) <- c("numeric", "character", "matrix", "dataframe", "new1", "new2", "new3")
names(my_list)[1] "numeric" "character" "matrix" "dataframe" "new1" "new2"
[7] "new3"
- Lists can be nested, meaning you can have lists within lists:
# Nested list
nested_list <- list(
sublist1 = list(a = 1, b = 2),
sublist2 = list(c = "apple", d = "banana")
)
nested_list
$sublist1
$sublist1$a
[1] 1
$sublist1$b
[1] 2
$sublist2
$sublist2$c
[1] "apple"
$sublist2$d
[1] "banana"
5 Next time
Next class we will focus on iteration using functions, lists, and purrr::map().
6 Post Class Survey
Please fill out the post-class survey.
Your responses are anonymous in that I separate your names from the survey answers before compiling/reading.
You may want to review previous years’ feedback here.
7 Acknowledgements
- Part 8 is based on the BSTA 505 Winter 2023 course, taught by Jessica Minnier.
- I made modifications to update the material from RMarkdown to Quarto, and streamlined/edited content for slides.
- Minnier’s Acknowledgements:
- Written by Jessica Minnier and Ted Laderas.
- Some material inspired by Kelly Bodwin’s Adventures in R.