writexl::write_xlsx

Function of the Week

Export data to xlsx
Author

Kathryn Fowler

Published

January 25, 2025

1 Function of the Week: write_xlsx

In this document, I will introduce the write_xlsx function and show what it’s for.

#load tidyverse up
library(tidyverse)
#example dataset
library(palmerpenguins)
data(penguins)

1.1 What is it for?

This function exports your data frame to an xlsx file. You can create an excel file with your data in, write it to a specific path, and even create an excel with multiple named sheets.

# load data; reading in multiple files so can demonstrate utility of write_xlsx later
PDAC_normal_clinical_data <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
                                        sheet = 2,
                                        na = " ") %>%
  mutate(ethnicity_race_ancestry_identified = na_if(ethnicity_race_ancestry_identified, "TSS did not collect this information")) # fixing na values as na

PDAC_normal_biospecimen_attributes <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
                                                 sheet = 3,
                                                 na = " ")

PDAC_normal_completeness <- read_excel(here("function_week", "data","S061_CPTAC_PDA_Discovery_Cohort_Clinical_Data_r1_Feb2021.xlsx"),
                                       sheet = 4,
                                       na = " ")

PDAC_normal_case_IDs <- read_excel(here("function_week", "data","S061_Case_IDs_for_GDC_release_of_PDAC_140_and_normal_pancreas_7.xlsx"),
                                   na = " ") %>%
  clean_names() # fix column names
PDAC_normal_case_IDs_clean <- PDAC_normal_case_IDs[!is.na(PDAC_normal_case_IDs$case_id), ] # removes empty rows
# manipulate data a bit
PDAC_normal_biospecimen_attributes_clean <- PDAC_normal_biospecimen_attributes %>%
  filter(biospecimen_disqualified != "Yes") # trying to filter out those samples w/ specimens that were disqualified

glimpse(PDAC_normal_biospecimen_attributes)
Rows: 300
Columns: 12
$ tumor_code                    <chr> "PDA", "PDA", "PDA", "PDA", "PDA", "PDA"…
$ case_id                       <chr> "C3L-00017", "C3L-00102", "C3L-00102", "…
$ specimen_id                   <chr> "C3L-00017-02", "C3L-00102-01", "C3L-001…
$ slide_id                      <chr> "C3L-00017-22", "C3L-00102-21", "C3L-001…
$ tissue_type                   <chr> "tumor", "tumor", "normal", "tumor", "tu…
$ biospecimen_disqualified      <chr> "No", "No", "No", "No", "No", "No", "No"…
$ percent_tumor_surface_area    <dbl> 100, 20, NA, 100, 60, 70, NA, 100, NA, 9…
$ percent_total_cellularity     <dbl> 100, 98, NA, 95, 80, 80, NA, 90, NA, 100…
$ non_cellular_component        <chr> "none present", "hemorrhage", NA, "fibro…
$ percent_tumor_nuclei          <dbl> 20, 20, NA, 50, 20, 25, NA, 70, NA, 40, …
$ percent_necrotic_surface_area <dbl> 0, 0, NA, 0, 0, 20, NA, 0, NA, 0, 2, 2, …
$ weight_in_mg                  <dbl> 230.0, 566.0, 534.0, 59.0, 116.0, 55.0, …
glimpse(PDAC_normal_biospecimen_attributes_clean) # looks like we successfully filtered out the disqualified specimens from this data
Rows: 275
Columns: 12
$ tumor_code                    <chr> "PDA", "PDA", "PDA", "PDA", "PDA", "PDA"…
$ case_id                       <chr> "C3L-00017", "C3L-00102", "C3L-00102", "…
$ specimen_id                   <chr> "C3L-00017-02", "C3L-00102-01", "C3L-001…
$ slide_id                      <chr> "C3L-00017-22", "C3L-00102-21", "C3L-001…
$ tissue_type                   <chr> "tumor", "tumor", "normal", "tumor", "tu…
$ biospecimen_disqualified      <chr> "No", "No", "No", "No", "No", "No", "No"…
$ percent_tumor_surface_area    <dbl> 100, 20, NA, 100, 60, 70, NA, 100, NA, 9…
$ percent_total_cellularity     <dbl> 100, 98, NA, 95, 80, 80, NA, 90, NA, 100…
$ non_cellular_component        <chr> "none present", "hemorrhage", NA, "fibro…
$ percent_tumor_nuclei          <dbl> 20, 20, NA, 50, 20, 25, NA, 70, NA, 40, …
$ percent_necrotic_surface_area <dbl> 0, 0, NA, 0, 0, 20, NA, 0, NA, 0, 2, 2, …
$ weight_in_mg                  <dbl> 230.0, 566.0, 534.0, 59.0, 116.0, 55.0, …
PDAC_normal_clinical_biospecimen_join <- inner_join(PDAC_normal_biospecimen_attributes_clean, PDAC_normal_clinical_data, by = "case_id") # join biospecimen dataset and clinical data dataset excluding those IDs that have disqualified specimens

glimpse(PDAC_normal_clinical_data)
Rows: 140
Columns: 41
$ tumor_code                                                                        <chr> …
$ case_id                                                                           <chr> …
$ type_of_analyzed_samples                                                          <chr> …
$ case_withdrawn                                                                    <chr> …
$ gender                                                                            <chr> …
$ age                                                                               <dbl> …
$ height_at_time_of_surgery_cm                                                      <dbl> …
$ weight_at_time_of_surgery_kg                                                      <dbl> …
$ BMI                                                                               <dbl> …
$ ethnicity_race_ancestry_identified                                                <chr> …
$ tumor_site                                                                        <chr> …
$ tumor_size_in_cm                                                                  <dbl> …
$ histologic_type                                                                   <chr> …
$ histologic_grade                                                                  <chr> …
$ pathologic_staging_primary_tumor_pT                                               <chr> …
$ pathologic_staging_regional_lymph_nodes_pN                                        <chr> …
$ clinical_staging_distant_metastasis_cM                                            <chr> …
$ pathologic_staging_distant_metastasis_pM                                          <chr> …
$ tumor_stage_pathological                                                          <chr> …
$ AJCC_TNM_cancer_staging_edition                                                   <chr> …
$ tumor_focality                                                                    <chr> …
$ tumor_necrosis                                                                    <chr> …
$ margin_status                                                                     <chr> …
$ residual_tumor                                                                    <chr> …
$ `lymph-vascular_invasion`                                                         <chr> …
$ perineural_invasion                                                               <chr> …
$ paraneoplastic_syndrome_present                                                   <chr> …
$ additional_pathologic_findings                                                    <chr> …
$ history_of_cancer                                                                 <chr> …
$ cancer_history_cancer_type                                                        <chr> …
$ alcohol_consumption                                                               <chr> …
$ tobacco_smoking_history                                                           <chr> …
$ was_the_participant_exposed_to_secondhand_smoke                                   <chr> …
$ days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact            <dbl> …
$ measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment <chr> …
$ `measure_of_success_of_outcome_at_date_of_completion_of_last_follow-up_form`      <chr> …
$ tumor_status_at_date_of_last_contact_or_death                                     <chr> …
$ vital_status_at_date_of_last_contact                                              <chr> …
$ cause_of_death_at_date_of_last_contact                                            <chr> …
$ new_tumor_after_initial_treatment_at_date_of_last_contact                         <chr> …
$ type_of_new_tumor_at_date_of_last_contact                                         <chr> …
glimpse(PDAC_normal_clinical_biospecimen_join) # dataset has longer number of rows than clinical data sheet did itself because duplicate case_ids for normal and tumor specimens included
Rows: 275
Columns: 52
$ tumor_code.x                                                                      <chr> …
$ case_id                                                                           <chr> …
$ specimen_id                                                                       <chr> …
$ slide_id                                                                          <chr> …
$ tissue_type                                                                       <chr> …
$ biospecimen_disqualified                                                          <chr> …
$ percent_tumor_surface_area                                                        <dbl> …
$ percent_total_cellularity                                                         <dbl> …
$ non_cellular_component                                                            <chr> …
$ percent_tumor_nuclei                                                              <dbl> …
$ percent_necrotic_surface_area                                                     <dbl> …
$ weight_in_mg                                                                      <dbl> …
$ tumor_code.y                                                                      <chr> …
$ type_of_analyzed_samples                                                          <chr> …
$ case_withdrawn                                                                    <chr> …
$ gender                                                                            <chr> …
$ age                                                                               <dbl> …
$ height_at_time_of_surgery_cm                                                      <dbl> …
$ weight_at_time_of_surgery_kg                                                      <dbl> …
$ BMI                                                                               <dbl> …
$ ethnicity_race_ancestry_identified                                                <chr> …
$ tumor_site                                                                        <chr> …
$ tumor_size_in_cm                                                                  <dbl> …
$ histologic_type                                                                   <chr> …
$ histologic_grade                                                                  <chr> …
$ pathologic_staging_primary_tumor_pT                                               <chr> …
$ pathologic_staging_regional_lymph_nodes_pN                                        <chr> …
$ clinical_staging_distant_metastasis_cM                                            <chr> …
$ pathologic_staging_distant_metastasis_pM                                          <chr> …
$ tumor_stage_pathological                                                          <chr> …
$ AJCC_TNM_cancer_staging_edition                                                   <chr> …
$ tumor_focality                                                                    <chr> …
$ tumor_necrosis                                                                    <chr> …
$ margin_status                                                                     <chr> …
$ residual_tumor                                                                    <chr> …
$ `lymph-vascular_invasion`                                                         <chr> …
$ perineural_invasion                                                               <chr> …
$ paraneoplastic_syndrome_present                                                   <chr> …
$ additional_pathologic_findings                                                    <chr> …
$ history_of_cancer                                                                 <chr> …
$ cancer_history_cancer_type                                                        <chr> …
$ alcohol_consumption                                                               <chr> …
$ tobacco_smoking_history                                                           <chr> …
$ was_the_participant_exposed_to_secondhand_smoke                                   <chr> …
$ days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact            <dbl> …
$ measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment <chr> …
$ `measure_of_success_of_outcome_at_date_of_completion_of_last_follow-up_form`      <chr> …
$ tumor_status_at_date_of_last_contact_or_death                                     <chr> …
$ vital_status_at_date_of_last_contact                                              <chr> …
$ cause_of_death_at_date_of_last_contact                                            <chr> …
$ new_tumor_after_initial_treatment_at_date_of_last_contact                         <chr> …
$ type_of_new_tumor_at_date_of_last_contact                                         <chr> …

Now we will export our data! Using write_xlsx

# we can export simply to a single excel
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join.xlsx"))

# we can also export with column names included or not
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join2.xlsx"), col_names = FALSE) # deletes column names

# can export with column names centered and bold
write_xlsx(PDAC_normal_clinical_biospecimen_join, path = here("function_week", "data", "output_data", "PDAC_normal_clinical_biospecimen_join3.xlsx"), format_headers = TRUE)

# you can write different data frames to multiple sheets in your exported excel file
data_list <- list(Sheet1 = PDAC_normal_clinical_data, Sheet2 = PDAC_normal_biospecimen_attributes, Sheet3 = PDAC_normal_completeness, Sheet4 = PDAC_normal_case_IDs_clean) # have to create a list of data frames; you can also name these sheets if you want

write_xlsx(data_list,
           path = here("function_week", "data", "output_data", "PDAC_normal_all_data.xlsx"))

# you can also name each of the sheets when they export to excel if you want
data_list <- list(clinical_data = PDAC_normal_clinical_data, biospecimen_attributes = PDAC_normal_biospecimen_attributes, completeness = PDAC_normal_completeness, case_ids = PDAC_normal_case_IDs_clean) # have to create a list of data frames; you can also name these sheets if you want

write_xlsx(data_list,
           path = here("function_week", "data", "output_data", "PDAC_normal_all_data_named.xlsx"))

1.2 Is it helpful?

Yes, I think this is helpful when you have manipulated your data - possibly done some calculations or manipulated things in some way - and want to save a new data from from there. You can also rework data with mutates, joins, merges, etc and save the data in its new form as an xlsx.