This will save both the bubble plots as well as corresponding supplementary tables in an excel spreadsheet
DE results were generated by ../differential_expression/covid_de_pipeline
options(java.parameters = "- Xmx1024m")#so that xlsx library works, https://stackoverflow.com/questions/27153974/how-to-fix-outofmemoryerror-java-gc-overhead-limit-exceeded-in-r
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.0 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.5
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
library(viridis)
## Loading required package: viridisLite
library(xlsx)
IN_DIR <- "data/CITE5p/all_batches/differential_expression/2020_08_09/sample_groups/"
OUT_DIR <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble/"
OUT_PATH_TABLE_FDR2 <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble_fgsea_table_fdr.2.xlsx"
OUT_PATH_TABLE_SPECIFIC_PATHWAY <- "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble_fgsea_table_specific_pathway.xlsx"
dir.create(OUT_DIR, recursive = TRUE)
PATHWAYS_DAT_PATH <- "input/genesets/pathway_summary-reduced_AJM_JT_09162020.xlsx"
CELL_GROUPING_IN_PATH <- "input/cell_groupings_for_bubble_plots_2020_08_19.csv"
keep_pathways_dat <- read_excel(PATHWAYS_DAT_PATH)
keep_pathways_dat <- keep_pathways_dat %>%
rename(pathway = Primary_gene_sets) %>%
mutate(Category = factor(Category, levels = unique(Category))) %>%
mutate(keep_pathway = TRUE)
cellgroups_dat <- read_csv(CELL_GROUPING_IN_PATH)
## Parsed with column specification:
## cols(
## cellgroup = col_character(),
## celltype = col_character()
## )
cellgroups_dat <- cellgroups_dat %>%
mutate(cellgroup =
factor(cellgroup, levels = c("Bcell", "CD4", "CD8", "OtherTcell",
"NK", "Gran", "Mono", "DC", "Platelet")))
OVERWRITE <- TRUE
files <- list.files(IN_DIR, full.names = F, recursive = TRUE)
files <- grep("fgsea_table", files, value = TRUE)
files <- files[sapply(strsplit(files, "\\/"), length) == 7]
files_dat <- data.frame(filename = files,
sample_group = sapply(strsplit(files, split = "\\/"), `[[`, 1),
model = sapply(strsplit(files, split = "\\/"), `[[`, 3),
coefficient = sapply(strsplit(files, split = "\\/"), `[[`, 5),
cell_anno = sapply(strsplit(files, split = "\\/"), `[[`, 6),
celltype_mod_coef = sapply(strsplit(files, split = "\\/"), `[[`, 7),
stringsAsFactors = FALSE) %>%
mutate(celltype = sapply(strsplit(celltype_mod_coef, split = "--"), `[[`, 1)) %>%
mutate(celltype = replace(celltype, grepl("specific_gating", cell_anno), paste("gated", celltype[grepl("specific_gating", cell_anno)])))
files_dat <- files_dat %>%
filter(coefficient != "Age") %>%
filter(cell_anno %in% c("Sorted-specific_gating", "Unsorted-WCTcoursecelltype"))
files_dat <- files_dat %>%
mutate(sampgroup_mod_coef = paste(sample_group, model, coefficient, sep = "--"))
keep_comparisons <- c(
"1" = "t0_plus_healthy--healthy_vs_covid--COVID-Healthy",
"2" = "t0_covid_only--PC1--PC1",
"3" = "all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1low",
"4" = "all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1high",
"5" = "all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_in_mid",
"6" = "all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_X_onsetMid-early",
"7" = "t0_covid_only--initial.Spike.corrected--initial.Spike.corrected"
)
files_dat <- files_dat %>%
filter(sampgroup_mod_coef %in% keep_comparisons)
sampgroup_mod_coefs <- keep_comparisons
for(i in seq_along(sampgroup_mod_coefs)){
sampg_m_c <- sampgroup_mod_coefs[[i]]
fignumber <- names(sampgroup_mod_coefs)[[i]]
outpath <- file.path(OUT_DIR, paste0(sampg_m_c, ".pdf"))
#outpath_table <- file.path(OUT_DIR_TABLE, paste0(sampg_m_c, ".tsv"))
print(outpath)
if(!OVERWRITE){
if(file.exists(outpath)){
next()
}
}
files_dat_sub <- files_dat %>%
filter(sampgroup_mod_coef == sampg_m_c) %>%
select(filename, celltype)
keep_paths <- files_dat_sub$filename
keep_paths <- file.path(IN_DIR, keep_paths)
names(keep_paths) <- files_dat_sub$celltype
fgsea_list <- lapply(keep_paths, function(path){
read_tsv(path,
col_types = cols(
pathway = col_character(),
pval = col_double(),
padj = col_double(),
ES = col_double(),
NES = col_double(),
nMoreExtreme = col_double(),
size = col_double(),
leadingEdge = col_character()
))
})
fgsea_dat <- bind_rows(fgsea_list, .id = "celltype")
fgsea_dat <- full_join(fgsea_dat, cellgroups_dat, by = "celltype")
fgsea_dat <- fgsea_dat %>%
filter(!grepl("dblt", celltype, ignore.case = TRUE) & !celltype %in% c("gated ungated", "gated_out", "Unknown", "PB_Plasmablasts", "TissueResMemT", "Granulocytes")) %>%
mutate(celltype = replace(celltype, celltype == "Treg", "Treg/Activated.Tcell"))
fgsea_dat <- full_join(keep_pathways_dat, fgsea_dat) %>%
filter(!is.na(celltype))
fgsea_dat <- fgsea_dat %>% mutate(celltype = factor(celltype), pathway = factor(pathway))
#new_geneset_names = factor(new_geneset_names))
#write table before setting adjusted p values to NA if above .2
if(which(sampgroup_mod_coefs == sampg_m_c) == 1){
APPEND <- FALSE
}else{
APPEND <- TRUE
}
sheetname <- sapply(strsplit(sampg_m_c, "--"), `[[`, 3)
sheetname <- paste(fignumber, sheetname, sep = "_")
sheetname <- gsub("PC1", "DSM", sheetname)
#get rid of foointerferon
fgsea_dat <- fgsea_dat %>% filter(!grepl("HSV", pathway)) %>%
mutate(pathway = gsub("foointerferon", "", pathway))
fgsea_dat %>%
filter(padj < .2) %>% select(-c("new_geneset_names", "keep_pathway")) %>%
as.data.frame() %>%
write.xlsx(file=OUT_PATH_TABLE_FDR2, sheetName=sheetname, row.names=FALSE,
append = APPEND)
fgsea_dat %>%
filter(padj < .2) %>%
filter(keep_pathway) %>%
select(-c("new_geneset_names", "keep_pathway")) %>%
as.data.frame() %>%
write.xlsx(file=OUT_PATH_TABLE_SPECIFIC_PATHWAY, sheetName=sheetname,
row.names=FALSE, append = APPEND)
fgsea_dat <- fgsea_dat %>% mutate(padj = replace(padj, padj > .2, NA)) %>%
filter(keep_pathway)
#filter(pval < .05)
fgsea_dat <- fgsea_dat %>%
mutate(new_geneset_names = as.character(new_geneset_names)) %>%
mutate(geneset_db = sapply(strsplit(new_geneset_names, "_"), `[[`, 1)) %>%
mutate(geneset_db = str_to_upper(geneset_db)) %>%
mutate(new_geneset_names = lapply(lapply(strsplit(new_geneset_names, "_"), `[`, -1), FUN = paste, collapse = "_")) %>%
#mutate(geneset_db = sapply(strsplit(new_geneset_names, "_"), `[[`, 1))
#mutate(new_geneset_names = gsub("_", " ", new_geneset_names)) %>%
mutate(new_geneset_names = str_to_sentence(new_geneset_names)) %>%
mutate(new_geneset_names = gsub(" ", "_", new_geneset_names)) %>%
mutate(new_geneset_names = paste(geneset_db, new_geneset_names, sep = "_")) %>%
mutate(new_geneset_names = gsub("nfkb", "NFkB", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("type_i", "type_I", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("\\(i\\)", "\\(I\\)", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("Jak", "JAK", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("Mtorc", "MTORC", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("Tnfa", "TNFa", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("stat", "STAT", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("YELLOW", "Yellow", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("Fever", "fever", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("Flu", "flu", new_geneset_names, ignore.case = TRUE)) %>%
mutate(new_geneset_names = gsub("GINSBURG", "Ginsburg", new_geneset_names, ignore.case = TRUE))
p = ggplot(fgsea_dat, aes(y = new_geneset_names, x = celltype, size = -log10(padj), fill = NES)) +
geom_point(shape = 21) +
scale_size_continuous(range = c(0,4), limits = c(0, 5)) +
#scale_fill_viridis_c() +
scale_fill_gradientn(colours = viridis(100), limits=c(-4.5, 4.5)) +
theme_bw() +
scale_x_discrete(position = "top") +
scale_y_discrete(position = "right") +
theme(axis.text.x=element_text(angle = 45, hjust = 0)) +
theme(axis.title.y = element_blank()) +
theme(legend.position = "right") +
labs(size = '-log10(ajdusted P value)', fill = 'Normalized Enrichment Score') +
theme(legend.title = element_text(face = "bold",colour = "black", size = 8)) +
theme(axis.text.y = element_text(size = 8, face = "bold", color = "black")) +
theme(axis.text.x = element_text(size = 8.5, face = "bold", color = "black")) +
guides(shape = guide_legend(override.aes = list(size = 5))) +
guides(color = guide_legend(override.aes = list(size = 5))) +
theme(legend.title = element_text(size = 8), legend.text = element_text(size = 8),
panel.spacing = unit(0, "lines"))+
ggtitle(sampg_m_c) +
#facet_wrap(~Category, ncol = 1, scales = "free_y", space = "free")
facet_grid(Category~ cellgroup, scales = "free", space = "free") +
theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
strip.text.y = element_blank()
)
#ggsave(plot = p, filename = outpath, height = 5.5, width = 9.5)
print(p)
}
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_plus_healthy--healthy_vs_covid--COVID-Healthy.pdf"
## Joining, by = "pathway"
## Warning: Removed 369 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_covid_only--PC1--PC1.pdf"
## Joining, by = "pathway"
## Warning: Removed 426 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1low.pdf"
## Joining, by = "pathway"
## Warning: Removed 414 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1group_onsetcontinous_interaction--days_onset_in_PC1high.pdf"
## Joining, by = "pathway"
## Warning: Removed 448 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_in_mid.pdf"
## Joining, by = "pathway"
## Warning: Removed 496 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//all_timepoints_covid_only--PC1_onset_group_interaction--PC1High-low_X_onsetMid-early.pdf"
## Joining, by = "pathway"
## Warning: Removed 462 rows containing missing values (geom_point).
## [1] "plots/CITE5p/all_batches/paper_figures/differential_expression/2020_12_02/fgsea_bubble//t0_covid_only--initial.Spike.corrected--initial.Spike.corrected.pdf"
## Joining, by = "pathway"
## Warning: Removed 405 rows containing missing values (geom_point).