#load tidyverse up
library(tidyverse)
#example dataset
library(palmerpenguins)
data(penguins)writexl::write_xlsx
Function of the Week
Export data to xlsx
1 Function of the Week: write_xlsx
In this document, I will introduce the write_xlsx function and show what it’s for.
1.1 What is it for?
This function exports your data frame to an xlsx file. You can create an excel file with your data in, write it to a specific path, and even create an excel with multiple named sheets.
# load data; reading in multiple files so can demonstrate utility of write_xlsx later
PDAC_normal_clinical_data <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
sheet = 2,
na = " ") %>%
mutate(ethnicity_race_ancestry_identified = na_if(ethnicity_race_ancestry_identified, "TSS did not collect this information")) # fixing na values as na
PDAC_normal_biospecimen_attributes <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
sheet = 3,
na = " ")
PDAC_normal_completeness <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
sheet = 4,
na = " ")
PDAC_normal_case_IDs <- read_excel(here("function_week", "data","S061_Case_IDs_for_GDC_release_of_PDAC_140_and_normal_pancreas_7.xlsx"),
na = " ") %>%
clean_names() # fix column names
PDAC_normal_case_IDs_clean <- PDAC_normal_case_IDs[!is.na(PDAC_normal_case_IDs$case_id), ] # removes empty rows# manipulate data a bit
PDAC_normal_biospecimen_attributes_clean <- PDAC_normal_biospecimen_attributes %>%
filter(biospecimen_disqualified != "Yes") # trying to filter out those samples w/ specimens that were disqualified
glimpse(PDAC_normal_biospecimen_attributes)Rows: 300
Columns: 12
$ tumor_code <chr> "PDA", "PDA", "PDA", "PDA", "PDA", "PDA"…
$ case_id <chr> "C3L-00017", "C3L-00102", "C3L-00102", "…
$ specimen_id <chr> "C3L-00017-02", "C3L-00102-01", "C3L-001…
$ slide_id <chr> "C3L-00017-22", "C3L-00102-21", "C3L-001…
$ tissue_type <chr> "tumor", "tumor", "normal", "tumor", "tu…
$ biospecimen_disqualified <chr> "No", "No", "No", "No", "No", "No", "No"…
$ percent_tumor_surface_area <dbl> 100, 20, NA, 100, 60, 70, NA, 100, NA, 9…
$ percent_total_cellularity <dbl> 100, 98, NA, 95, 80, 80, NA, 90, NA, 100…
$ non_cellular_component <chr> "none present", "hemorrhage", NA, "fibro…
$ percent_tumor_nuclei <dbl> 20, 20, NA, 50, 20, 25, NA, 70, NA, 40, …
$ percent_necrotic_surface_area <dbl> 0, 0, NA, 0, 0, 20, NA, 0, NA, 0, 2, 2, …
$ weight_in_mg <dbl> 230.0, 566.0, 534.0, 59.0, 116.0, 55.0, …
glimpse(PDAC_normal_biospecimen_attributes_clean) # looks like we successfully filtered out the disqualified specimens from this dataRows: 275
Columns: 12
$ tumor_code <chr> "PDA", "PDA", "PDA", "PDA", "PDA", "PDA"…
$ case_id <chr> "C3L-00017", "C3L-00102", "C3L-00102", "…
$ specimen_id <chr> "C3L-00017-02", "C3L-00102-01", "C3L-001…
$ slide_id <chr> "C3L-00017-22", "C3L-00102-21", "C3L-001…
$ tissue_type <chr> "tumor", "tumor", "normal", "tumor", "tu…
$ biospecimen_disqualified <chr> "No", "No", "No", "No", "No", "No", "No"…
$ percent_tumor_surface_area <dbl> 100, 20, NA, 100, 60, 70, NA, 100, NA, 9…
$ percent_total_cellularity <dbl> 100, 98, NA, 95, 80, 80, NA, 90, NA, 100…
$ non_cellular_component <chr> "none present", "hemorrhage", NA, "fibro…
$ percent_tumor_nuclei <dbl> 20, 20, NA, 50, 20, 25, NA, 70, NA, 40, …
$ percent_necrotic_surface_area <dbl> 0, 0, NA, 0, 0, 20, NA, 0, NA, 0, 2, 2, …
$ weight_in_mg <dbl> 230.0, 566.0, 534.0, 59.0, 116.0, 55.0, …
PDAC_normal_clinical_biospecimen_join <- inner_join(PDAC_normal_biospecimen_attributes_clean, PDAC_normal_clinical_data, by = "case_id") # join biospecimen dataset and clinical data dataset excluding those IDs that have disqualified specimens
glimpse(PDAC_normal_clinical_data)Rows: 140
Columns: 41
$ tumor_code <chr> …
$ case_id <chr> …
$ type_of_analyzed_samples <chr> …
$ case_withdrawn <chr> …
$ gender <chr> …
$ age <dbl> …
$ height_at_time_of_surgery_cm <dbl> …
$ weight_at_time_of_surgery_kg <dbl> …
$ BMI <dbl> …
$ ethnicity_race_ancestry_identified <chr> …
$ tumor_site <chr> …
$ tumor_size_in_cm <dbl> …
$ histologic_type <chr> …
$ histologic_grade <chr> …
$ pathologic_staging_primary_tumor_pT <chr> …
$ pathologic_staging_regional_lymph_nodes_pN <chr> …
$ clinical_staging_distant_metastasis_cM <chr> …
$ pathologic_staging_distant_metastasis_pM <chr> …
$ tumor_stage_pathological <chr> …
$ AJCC_TNM_cancer_staging_edition <chr> …
$ tumor_focality <chr> …
$ tumor_necrosis <chr> …
$ margin_status <chr> …
$ residual_tumor <chr> …
$ `lymph-vascular_invasion` <chr> …
$ perineural_invasion <chr> …
$ paraneoplastic_syndrome_present <chr> …
$ additional_pathologic_findings <chr> …
$ history_of_cancer <chr> …
$ cancer_history_cancer_type <chr> …
$ alcohol_consumption <chr> …
$ tobacco_smoking_history <chr> …
$ was_the_participant_exposed_to_secondhand_smoke <chr> …
$ days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact <dbl> …
$ measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment <chr> …
$ `measure_of_success_of_outcome_at_date_of_completion_of_last_follow-up_form` <chr> …
$ tumor_status_at_date_of_last_contact_or_death <chr> …
$ vital_status_at_date_of_last_contact <chr> …
$ cause_of_death_at_date_of_last_contact <chr> …
$ new_tumor_after_initial_treatment_at_date_of_last_contact <chr> …
$ type_of_new_tumor_at_date_of_last_contact <chr> …
glimpse(PDAC_normal_clinical_biospecimen_join) # dataset has longer number of rows than clinical data sheet did itself because duplicate case_ids for normal and tumor specimens includedRows: 275
Columns: 52
$ tumor_code.x <chr> …
$ case_id <chr> …
$ specimen_id <chr> …
$ slide_id <chr> …
$ tissue_type <chr> …
$ biospecimen_disqualified <chr> …
$ percent_tumor_surface_area <dbl> …
$ percent_total_cellularity <dbl> …
$ non_cellular_component <chr> …
$ percent_tumor_nuclei <dbl> …
$ percent_necrotic_surface_area <dbl> …
$ weight_in_mg <dbl> …
$ tumor_code.y <chr> …
$ type_of_analyzed_samples <chr> …
$ case_withdrawn <chr> …
$ gender <chr> …
$ age <dbl> …
$ height_at_time_of_surgery_cm <dbl> …
$ weight_at_time_of_surgery_kg <dbl> …
$ BMI <dbl> …
$ ethnicity_race_ancestry_identified <chr> …
$ tumor_site <chr> …
$ tumor_size_in_cm <dbl> …
$ histologic_type <chr> …
$ histologic_grade <chr> …
$ pathologic_staging_primary_tumor_pT <chr> …
$ pathologic_staging_regional_lymph_nodes_pN <chr> …
$ clinical_staging_distant_metastasis_cM <chr> …
$ pathologic_staging_distant_metastasis_pM <chr> …
$ tumor_stage_pathological <chr> …
$ AJCC_TNM_cancer_staging_edition <chr> …
$ tumor_focality <chr> …
$ tumor_necrosis <chr> …
$ margin_status <chr> …
$ residual_tumor <chr> …
$ `lymph-vascular_invasion` <chr> …
$ perineural_invasion <chr> …
$ paraneoplastic_syndrome_present <chr> …
$ additional_pathologic_findings <chr> …
$ history_of_cancer <chr> …
$ cancer_history_cancer_type <chr> …
$ alcohol_consumption <chr> …
$ tobacco_smoking_history <chr> …
$ was_the_participant_exposed_to_secondhand_smoke <chr> …
$ days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact <dbl> …
$ measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment <chr> …
$ `measure_of_success_of_outcome_at_date_of_completion_of_last_follow-up_form` <chr> …
$ tumor_status_at_date_of_last_contact_or_death <chr> …
$ vital_status_at_date_of_last_contact <chr> …
$ cause_of_death_at_date_of_last_contact <chr> …
$ new_tumor_after_initial_treatment_at_date_of_last_contact <chr> …
$ type_of_new_tumor_at_date_of_last_contact <chr> …
Now we will export our data! Using write_xlsx
# we can export simply to a single excel
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join.xlsx"))
# we can also export with column names included or not
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join2.xlsx"), col_names = FALSE) # deletes column names
# can export with column names centered and bold
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join3.xlsx"), format_headers = TRUE)
# you can write different data frames to multiple sheets in your exported excel file
data_list <- list(Sheet1 = PDAC_normal_clinical_data, Sheet2 = PDAC_normal_biospecimen_attributes, Sheet3 = PDAC_normal_completeness, Sheet4 = PDAC_normal_case_IDs_clean) # have to create a list of data frames; you can also name these sheets if you want
write_xlsx(data_list,
path = here("function_week", "data", "output_data", "PDAC_normal_all_data.xlsx"))
# you can also name each of the sheets when they export to excel if you want
data_list <- list(clinical_data = PDAC_normal_clinical_data, biospecimen_attributes = PDAC_normal_biospecimen_attributes, completeness = PDAC_normal_completeness, case_ids = PDAC_normal_case_IDs_clean) # have to create a list of data frames; you can also name these sheets if you want
write_xlsx(data_list,
path = here("function_week", "data", "output_data", "PDAC_normal_all_data_named.xlsx"))1.2 Is it helpful?
Yes, I think this is helpful when you have manipulated your data - possibly done some calculations or manipulated things in some way - and want to save a new data from from there. You can also rework data with mutates, joins, merges, etc and save the data in its new form as an xlsx.