Bubble plots showing geneset enrichments

This will save both the bubble plots as well as corresponding supplementary tables in an excel spreadsheet

DE results were generated by ../differential_expression/covid_de_pipeline

options(java.parameters = "- Xmx1024m")#so that xlsx library works, https://stackoverflow.com/questions/27153974/how-to-fix-outofmemoryerror-java-gc-overhead-limit-exceeded-in-r
library(tidyverse)

## ── Attaching packages ────────────────────────────────── tidyverse 1.3.0 ──

## ✔ ggplot2 3.3.0     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.5
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(readxl)
library(viridis)

## Loading required package: viridisLite

library(xlsx)


IN_DIR <- "data/CITE5p/all_batches/differential_expression/2020_08_09/sample_groups/"
OUT_DIR <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble/"
OUT_PATH_TABLE_FDR2 <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble_fgsea_table_fdr.2.xlsx"
OUT_PATH_TABLE_SPECIFIC_PATHWAY <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble_fgsea_table_specific_pathway.xlsx"
dir.create(OUT_DIR, recursive = TRUE)

PATHWAYS_DAT_PATH <- "input/genesets/pathway_summary-reduced_AJM_JT_09162020.xlsx"

CELL_GROUPING_IN_PATH <- "input/cell_groupings_for_bubble_plots_2020_08_19.csv"

keep_pathways_dat <- read_excel(PATHWAYS_DAT_PATH)

keep_pathways_dat <- keep_pathways_dat %>%
        rename(pathway = Primary_gene_sets) %>%
        mutate(Category = factor(Category, levels = unique(Category))) %>%
        mutate(keep_pathway = TRUE)

cellgroups_dat <- read_csv(CELL_GROUPING_IN_PATH)

## Parsed with column specification:
## cols(
##   cellgroup = col_character(),
##   celltype = col_character()
## )

cellgroups_dat <- cellgroups_dat %>%
        mutate(cellgroup = 
               factor(cellgroup, levels = c("Bcell", "CD4", "CD8", "OtherTcell",
                                            "NK", "Gran", "Mono", "DC", "Platelet")))

OVERWRITE <- TRUE

files <- list.files(IN_DIR, full.names = F, recursive = TRUE) 
files <- grep("fgsea_table", files, value = TRUE)

files <- files[sapply(strsplit(files, "\\/"), length) == 7]

files_dat <- data.frame(filename = files, 
                       sample_group = sapply(strsplit(files, split = "\\/"), `[[`, 1),
                       model = sapply(strsplit(files, split = "\\/"), `[[`, 3),
                       coefficient = sapply(strsplit(files, split = "\\/"), `[[`, 5),
                       cell_anno = sapply(strsplit(files, split = "\\/"), `[[`, 6),
                       celltype_mod_coef = sapply(strsplit(files, split = "\\/"), `[[`, 7),
                       stringsAsFactors = FALSE) %>%
                       mutate(celltype = sapply(strsplit(celltype_mod_coef, split = "--"), `[[`, 1)) %>%
          mutate(celltype = replace(celltype, grepl("specific_gating", cell_anno), paste("gated", celltype[grepl("specific_gating", cell_anno)])))

files_dat <- files_dat %>%
        filter(coefficient != "Age") %>%
        filter(cell_anno %in% c("Sorted-specific_gating", "Unsorted-WCTcoursecelltype"))

files_dat <- files_dat %>%
        mutate(sampgroup_mod_coef = paste(sample_group, model, coefficient, sep = "--"))

keep_comparisons <- c(
                      "1" = "t0_plus_healthy--healthy_vs_covid--COVID-Healthy",
                      "2" = "t0_covid_only--PC1--PC1",
                      "3" = "all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1low",
                      "4" = "all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1high",
                      "5" = "all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_in_mid",
                      "6" = "all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_X_onsetMid-early",
                      "7" = "t0_covid_only--initial.Spike.corrected--initial.Spike.corrected"
                      )

files_dat <- files_dat %>%
        filter(sampgroup_mod_coef %in% keep_comparisons)

sampgroup_mod_coefs <- keep_comparisons
for(i in seq_along(sampgroup_mod_coefs)){
  sampg_m_c <- sampgroup_mod_coefs[[i]]
  fignumber <- names(sampgroup_mod_coefs)[[i]]

  outpath <- file.path(OUT_DIR, paste0(sampg_m_c, ".pdf"))
  #outpath_table <- file.path(OUT_DIR_TABLE, paste0(sampg_m_c, ".tsv"))
  print(outpath)
  if(!OVERWRITE){
    if(file.exists(outpath)){
      next()
    }
  }

  files_dat_sub <- files_dat %>%
          filter(sampgroup_mod_coef == sampg_m_c) %>%
          select(filename, celltype)
  keep_paths <- files_dat_sub$filename
  keep_paths <- file.path(IN_DIR, keep_paths)
  names(keep_paths) <- files_dat_sub$celltype


  fgsea_list <- lapply(keep_paths, function(path){
    read_tsv(path,
             col_types = cols(
                         pathway = col_character(),
                         pval = col_double(),
                         padj = col_double(),
                         ES = col_double(),
                         NES = col_double(),
                         nMoreExtreme = col_double(),
                         size = col_double(),
                         leadingEdge = col_character()
                         ))
  })
  fgsea_dat <- bind_rows(fgsea_list, .id = "celltype")
  fgsea_dat <- full_join(fgsea_dat, cellgroups_dat, by = "celltype")

  fgsea_dat <- fgsea_dat %>%
          filter(!grepl("dblt", celltype, ignore.case = TRUE) & !celltype %in% c("gated ungated", "gated_out", "Unknown", "PB_Plasmablasts", "TissueResMemT", "Granulocytes")) %>%
          mutate(celltype = replace(celltype, celltype == "Treg", "Treg/Activated.Tcell"))

  fgsea_dat <- full_join(keep_pathways_dat, fgsea_dat) %>%
          filter(!is.na(celltype))


  fgsea_dat <- fgsea_dat %>% mutate(celltype = factor(celltype), pathway = factor(pathway))
                                    #new_geneset_names = factor(new_geneset_names))

  #write table before setting adjusted p values to NA if above .2
  if(which(sampgroup_mod_coefs == sampg_m_c) == 1){
    APPEND <- FALSE
  }else{
    APPEND <- TRUE
  }

  sheetname <- sapply(strsplit(sampg_m_c, "--"), `[[`, 3)
  sheetname <- paste(fignumber, sheetname, sep = "_")
  sheetname <- gsub("PC1", "DSM", sheetname)

  #get rid of foointerferon
  fgsea_dat <- fgsea_dat %>% filter(!grepl("HSV", pathway)) %>%
          mutate(pathway = gsub("foointerferon", "", pathway))

  fgsea_dat %>% 
          filter(padj < .2) %>% select(-c("new_geneset_names", "keep_pathway")) %>%
          as.data.frame() %>% 
          write.xlsx(file=OUT_PATH_TABLE_FDR2, sheetName=sheetname, row.names=FALSE, 
                     append = APPEND)

  fgsea_dat %>% 
          filter(padj < .2) %>% 
          filter(keep_pathway) %>% 
          select(-c("new_geneset_names", "keep_pathway")) %>%
          as.data.frame() %>% 
          write.xlsx(file=OUT_PATH_TABLE_SPECIFIC_PATHWAY, sheetName=sheetname, 
                     row.names=FALSE, append = APPEND)

  fgsea_dat <- fgsea_dat %>% mutate(padj = replace(padj, padj > .2, NA)) %>%
          filter(keep_pathway)
          #filter(pval < .05)



  fgsea_dat <- fgsea_dat %>%
          mutate(new_geneset_names = as.character(new_geneset_names)) %>%
          mutate(geneset_db = sapply(strsplit(new_geneset_names, "_"), `[[`, 1)) %>% 
          mutate(geneset_db = str_to_upper(geneset_db)) %>%
          mutate(new_geneset_names = lapply(lapply(strsplit(new_geneset_names, "_"), `[`, -1), FUN = paste, collapse = "_")) %>%
          #mutate(geneset_db = sapply(strsplit(new_geneset_names, "_"), `[[`, 1))
          #mutate(new_geneset_names = gsub("_", " ", new_geneset_names)) %>%
          mutate(new_geneset_names = str_to_sentence(new_geneset_names)) %>%
          mutate(new_geneset_names = gsub(" ", "_", new_geneset_names)) %>%
          mutate(new_geneset_names = paste(geneset_db, new_geneset_names, sep = "_")) %>%
          mutate(new_geneset_names = gsub("nfkb", "NFkB", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("type_i", "type_I", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("\\(i\\)", "\\(I\\)", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("Jak", "JAK", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("Mtorc", "MTORC", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("Tnfa", "TNFa", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("stat", "STAT", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("YELLOW", "Yellow", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("Fever", "fever", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("Flu", "flu", new_geneset_names, ignore.case = TRUE)) %>%
          mutate(new_geneset_names = gsub("GINSBURG", "Ginsburg", new_geneset_names, ignore.case = TRUE))

  p = ggplot(fgsea_dat, aes(y = new_geneset_names, x = celltype, size = -log10(padj), fill = NES)) + 
    geom_point(shape = 21) + 
    scale_size_continuous(range = c(0,4), limits = c(0, 5)) +
    #scale_fill_viridis_c() + 
    scale_fill_gradientn(colours = viridis(100), limits=c(-4.5, 4.5)) +
    theme_bw() +
    scale_x_discrete(position = "top") + 
    scale_y_discrete(position = "right") + 
    theme(axis.text.x=element_text(angle = 45, hjust = 0)) + 
    theme(axis.title.y = element_blank()) +
    theme(legend.position = "right") + 
    labs(size = '-log10(ajdusted P value)', fill = 'Normalized Enrichment Score') +
    theme(legend.title = element_text(face = "bold",colour = "black", size = 8)) +
    theme(axis.text.y = element_text(size = 8, face = "bold", color = "black")) + 
    theme(axis.text.x = element_text(size = 8.5, face = "bold", color = "black")) + 
    guides(shape = guide_legend(override.aes = list(size = 5))) + 
    guides(color = guide_legend(override.aes = list(size = 5))) + 
    theme(legend.title = element_text(size = 8), legend.text = element_text(size = 8),
          panel.spacing = unit(0, "lines"))+
    ggtitle(sampg_m_c) + 
    #facet_wrap(~Category, ncol = 1, scales = "free_y", space = "free")
    facet_grid(Category~ cellgroup, scales = "free", space = "free") +
    theme(
          strip.background = element_blank(),
          strip.text.x = element_blank(),
          strip.text.y = element_blank()
    )

  #ggsave(plot = p, filename = outpath, height = 5.5, width = 9.5)
  print(p)


}

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_plus_healthy--healthy_vs_covid--COVID-Healthy.pdf"

## Joining, by = "pathway"

## Warning: Removed 369 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_covid_only--PC1--PC1.pdf"

## Joining, by = "pathway"

## Warning: Removed 426 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1low.pdf"

## Joining, by = "pathway"

## Warning: Removed 414 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1high.pdf"

## Joining, by = "pathway"

## Warning: Removed 448 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_in_mid.pdf"

## Joining, by = "pathway"

## Warning: Removed 496 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_X_onsetMid-early.pdf"

## Joining, by = "pathway"

## Warning: Removed 462 rows containing missing values (geom_point).

## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_covid_only--initial.Spike.corrected--initial.Spike.corrected.pdf"

## Joining, by = "pathway"

## Warning: Removed 405 rows containing missing values (geom_point).