# Setup: chunk options, packages, and the example dataset.
# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Attach the packages used below (pacman::p_load() takes bare package names).
pacman::p_load(
  tidyverse,
  readxl,
  here,
  janitor,
  gt
)
# Load the example dataset: the second sheet of the TCGA clinical workbook,
# located relative to the project root via here(). Only the literal string
# "NA" is declared as missing at load time.
clinical_data <- read_excel(
  here("function_week", "data", "tcga_clinical_data.xlsx"),
  sheet = 2,
  na = "NA"
)
dplyr::na_if
Function of the Week
NA
1 Function Of Interest: dplyr - na_if(x,y)
In this document, I will introduce the dplyr na_if() function and show what it’s for.
2 What is it for?
This function is used to replace annoying values with NA. It allows you to replace NaN with NA, even though NaN == NaN returns NA.
2.1 Example 1: The Basics
# Example 1: The basics
# na_if() is called as `na_if(x, y)`, where x is the vector to modify and y is
# the value to replace with NA.
x <- c(1, 25, -5, 0, 10)
x_inf <- 100 / x
# Dividing by the zero element gives us an infinite value, which has
# downstream effects on common data analysis.
x_inf
#> [1] 100 4 -20 Inf 10
mean(x_inf, na.rm = TRUE)
#> [1] Inf
# We see that we are not given a proper mean.
# Replacing 0 with NA before dividing lets na.rm = TRUE drop that element.
x_na_if <- 100 / na_if(x, 0)
x_na_if
#> [1] 100 4 -20 NA 10
mean(x_na_if, na.rm = TRUE)
#> [1] 23.5
# Success! What a meaningful change!
The previous example was adapted from Rdocumentation.org.
2.2 Let's Set The Table: Data Clean Up
Now that we have glimpsed the power of na_if(), let's see how to use it on an actual data set.
# First, let's clean up the column names to aid data viewing.
# rename() takes `new_name = old_name` pairs; afterwards the
# updated_datetime column is dropped.
clinical_clean <- clinical_data %>%
  rename(
    tumor_class = classification_of_tumor,
    last_status = last_known_disease_status,
    vital = vital_status,
    morph = morphology,
    diagnosis = primary_diagnosis,
    stage = tumor_stage,
    last_diseasestat = days_to_last_known_disease_status,
    datetime = created_datetime,
    recurrence = days_to_recurrence,
    origin = tissue_or_organ_of_origin,
    progression = progression_or_recurrence,
    biopsy_site = site_of_resection_or_biopsy,
    last_follow_up = days_to_last_follow_up,
    intent_type = treatment_intent_type,
    treatment = treatment_or_therapy
  ) %>%
  select(-updated_datetime)
2.3 Putting It All On The Table
# Preview the first two rows of the cleaned data as a gt table.
clinical_table <- clinical_clean %>%
  head(2)
gt(clinical_table)
| submitter_id | tumor_class | last_status | diagnosis | stage | age_at_diagnosis | vital | morph | days_to_death | last_diseasestat | datetime | state | recurrence | diagnosis_id | tumor_grade | origin | days_to_birth | progression | prior_malignancy | biopsy_site | last_follow_up | cigarettes_per_day | weight | alcohol_history | alcohol_intensity | bmi | years_smoked | exposure_id | height | gender | year_of_birth | race | demographic_id | ethnicity | year_of_death | treatment_id | therapeutic_agents | intent_type | treatment | bcr_patient_barcode | disease |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TCGA-2W-A8YY | not reported | not reported | C53.9 | not reported | 18886 | alive | 8560/3 | NA | NA | NA | live | NA | 908ee155-bfca-5240-b78b-6b82f565aedd | not reported | C53.9 | -18886 | not reported | not reported | C53.9 | 533 | NA | 42 | NA | NA | 16.40625 | NA | 67aa3949-ad62-5f81-ad08-e8e295f84cfb | 160 | female | 1962 | white | b89e4409-f7c6-53f2-a85f-31448e2ae1f6 | not hispanic or latino | NA | 026fa545-ac02-5915-ac23-5984d67a75f8 | NA | NA | NA | TCGA-2W-A8YY | CESC |
| TCGA-4J-AA1J | not reported | not reported | C53.9 | not reported | 11611 | alive | 8070/3 | NA | NA | NA | live | NA | 20b61f8a-5efb-5bcc-aaad-5f79cd8ff313 | not reported | C53.9 | -11611 | not reported | not reported | C53.9 | 542 | NA | 48 | NA | NA | 17.63085 | NA | 93ddbaf1-67b9-59a9-8a04-ef00db42fd54 | 165 | female | 1982 | white | 1c2c712d-0a6c-5b52-a4b0-8e1d61256f6c | not hispanic or latino | NA | f68ae36d-85e6-558c-91a7-f5b69b9dde19 | NA | NA | NA | TCGA-4J-AA1J | CESC |
# As you can see, there are many entries that use the phrase "not reported". This phrase was not caught when we loaded the data into R.
2.4 Example 2: Are You Tired of Data Being not reported?
# Now, if we want all the "not reported" entries to be categorized as NA, we
# use the na_if() function, one column at a time.
clinical_na_if <- clinical_clean %>%
  mutate(
    tumor_class = na_if(tumor_class, "not reported"),
    last_status = na_if(last_status, "not reported"),
    stage = na_if(stage, "not reported"),
    tumor_grade = na_if(tumor_grade, "not reported")
  )
# Only the four columns listed above are changed; other columns still contain
# "not reported".
gt(head(clinical_na_if, 2))
| submitter_id | tumor_class | last_status | diagnosis | stage | age_at_diagnosis | vital | morph | days_to_death | last_diseasestat | datetime | state | recurrence | diagnosis_id | tumor_grade | origin | days_to_birth | progression | prior_malignancy | biopsy_site | last_follow_up | cigarettes_per_day | weight | alcohol_history | alcohol_intensity | bmi | years_smoked | exposure_id | height | gender | year_of_birth | race | demographic_id | ethnicity | year_of_death | treatment_id | therapeutic_agents | intent_type | treatment | bcr_patient_barcode | disease |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TCGA-2W-A8YY | NA | NA | C53.9 | NA | 18886 | alive | 8560/3 | NA | NA | NA | live | NA | 908ee155-bfca-5240-b78b-6b82f565aedd | NA | C53.9 | -18886 | not reported | not reported | C53.9 | 533 | NA | 42 | NA | NA | 16.40625 | NA | 67aa3949-ad62-5f81-ad08-e8e295f84cfb | 160 | female | 1962 | white | b89e4409-f7c6-53f2-a85f-31448e2ae1f6 | not hispanic or latino | NA | 026fa545-ac02-5915-ac23-5984d67a75f8 | NA | NA | NA | TCGA-2W-A8YY | CESC |
| TCGA-4J-AA1J | NA | NA | C53.9 | NA | 11611 | alive | 8070/3 | NA | NA | NA | live | NA | 20b61f8a-5efb-5bcc-aaad-5f79cd8ff313 | NA | C53.9 | -11611 | not reported | not reported | C53.9 | 542 | NA | 48 | NA | NA | 17.63085 | NA | 93ddbaf1-67b9-59a9-8a04-ef00db42fd54 | 165 | female | 1982 | white | 1c2c712d-0a6c-5b52-a4b0-8e1d61256f6c | not hispanic or latino | NA | f68ae36d-85e6-558c-91a7-f5b69b9dde19 | NA | NA | NA | TCGA-4J-AA1J | CESC |
As you can see, editing one column is easy, but once you get past three columns, writing out na_if() for each one gets tedious. There has got to be a better way! Akin to any late-night infomercial, there is a better way: our good old friend, the across() function.
2.5 Example 3: Have No Fear na_if Is Here!
## Example 3: Multiple columns
# Apply na_if() to every character column at once: where(is.character)
# selects the columns and the lambda replaces "not reported" with NA in each.
clinical_across <- clinical_clean %>%
  mutate(
    across(
      where(is.character),
      ~ na_if(.x, "not reported")
    )
  )
gt(head(clinical_across, 2))
| submitter_id | tumor_class | last_status | diagnosis | stage | age_at_diagnosis | vital | morph | days_to_death | last_diseasestat | datetime | state | recurrence | diagnosis_id | tumor_grade | origin | days_to_birth | progression | prior_malignancy | biopsy_site | last_follow_up | cigarettes_per_day | weight | alcohol_history | alcohol_intensity | bmi | years_smoked | exposure_id | height | gender | year_of_birth | race | demographic_id | ethnicity | year_of_death | treatment_id | therapeutic_agents | intent_type | treatment | bcr_patient_barcode | disease |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TCGA-2W-A8YY | NA | NA | C53.9 | NA | 18886 | alive | 8560/3 | NA | NA | NA | live | NA | 908ee155-bfca-5240-b78b-6b82f565aedd | NA | C53.9 | -18886 | NA | NA | C53.9 | 533 | NA | 42 | NA | NA | 16.40625 | NA | 67aa3949-ad62-5f81-ad08-e8e295f84cfb | 160 | female | 1962 | white | b89e4409-f7c6-53f2-a85f-31448e2ae1f6 | not hispanic or latino | NA | 026fa545-ac02-5915-ac23-5984d67a75f8 | NA | NA | NA | TCGA-2W-A8YY | CESC |
| TCGA-4J-AA1J | NA | NA | C53.9 | NA | 11611 | alive | 8070/3 | NA | NA | NA | live | NA | 20b61f8a-5efb-5bcc-aaad-5f79cd8ff313 | NA | C53.9 | -11611 | NA | NA | C53.9 | 542 | NA | 48 | NA | NA | 17.63085 | NA | 93ddbaf1-67b9-59a9-8a04-ef00db42fd54 | 165 | female | 1982 | white | 1c2c712d-0a6c-5b52-a4b0-8e1d61256f6c | not hispanic or latino | NA | f68ae36d-85e6-558c-91a7-f5b69b9dde19 | NA | NA | NA | TCGA-4J-AA1J | CESC |
3 Is it helpful?
This function would be particularly useful if you wanted to change any NaN inputs to NA, or if you had very cluttered data with lots of “unknown” or “not reported” entries, or any other unusual placeholder for NA. This is also useful because when loading excel data you cannot have two different NA arguments.
What is particularly useful about this function is nesting it inside mutate() and across(), because you can then make large edits to several columns at once.