Code to reproduce manuscript results &
figures
Matt Mulè
email: mpm63 AT cantab.ac.uk
Data associated with this manuscript used in analysis workflow below
are stored at the following repository:
10.5281/zenodo.10546916
The data repository includes CITE-seq data in additional formats for
reuse with other packages.
flu_vacc_CITEseq_Seurat4.rds
- a Seurat version 4 object
(separate assays for RNA and ADT).
flu_vacc_CITEseq_combinedassay.h5ad
- an Anndata object for
analysis in python / scanpy. (RNA + ADT
combined in single matrix).
Many scripts in this workflow require the scglmmr package associated
with this manuscript. Separate stand alone R functions used in package
can also be found in functions/scglmmr_functions/
and
sourced at the beginning of a script.
The data downloaded from the Zenodo repository should be added to a
data/ directory in the project root folder. The analysis can without
specifying any file paths. Each R script is self-contained, reading data
from the /data folder and writing to figures or results files within
each analysis subdirectory relative to the root directory using the R
package here
. Unless otherwise noted, scripts were run with
R 4.0.5. Package versions are listed in the table.
non CITEseq data used in the analysis is saved in the following
directories:
data
–vand
–CHI_H1N1_data
–full_metadata
–stim
Annotated CITE-seq data in a Seurat object is saved in the project
directory as h1h5_annotated_with_meta.rds
This section is run with R 3.5.1
Calculate and visualize distribution of individuals across protein based
clusters.
Biaxial plots of unsupervised cluster dsb normalized protein
expression.
mid_res/sample_and_protein_distributions/1.sample.lineage.proteinbiaxial.r
## Must be run in R 3.5.1
# umap of joint clustering results
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
source("functions/analysis_functions.R")
# Set path
= here("mid_res/sample_and_protein_distributions/figures/");
figpath dir.create(figpath, recursive = TRUE)
# full sample bar plot
= readRDS(file = here("data/h1h5_annotated_with_meta.rds"))@meta.data
md = md$celltype_joint %>% unique() %>% sort()
celltypes = celltypes[c(7:12)]
t4 = celltypes[c(13:16)]
t8 = celltypes[c(4:5, 18, 19, 21, 23)]
myeloid = celltypes[c(1,2,6)]
bc = celltypes[c(22)]
nk = celltypes[c(3,20)]
unconventionalT
= md %>% mutate(lineage =
md if_else(celltype_joint %in% t4, "CD4 T Cell",
if_else(celltype_joint %in% t8, "CD8 T cell",
if_else(celltype_joint %in% myeloid, "Myeloid",
if_else(celltype_joint %in% bc, "B cell",
if_else(celltype_joint %in% nk, "NK cell",
if_else(celltype_joint %in% unconventionalT, "unconventional T",
false = "other")))))))
= md %>% filter(!celltype_joint %in% "DOUBLET")
md2
# calc and vis fraction of total
= md2 %>% group_by(lineage, sample) %>% tally
d
=
p ggplot(d, aes(x = sample, y = n, fill = lineage)) +
geom_bar(position = 'fill', stat = 'identity', show.legend = TRUE) +
::scale_fill_jama() +
ggsciylab("percent of total") +
theme(axis.text.x=element_text(angle = -90, hjust = 0)) +
theme(axis.text.x = element_text(size =6)) +
theme(axis.title.x = element_blank())
ggsave(p, filename = paste0(figpath, "LINEAGE_fullsamplebarplot.pdf"), width = 9, height = 4)
#### #Manual gate plots
= paste0(figpath,"mgplots/") ; dir.create(figpath)
figpath
# Day 1 cohort CD14 monocyte data
= as.data.frame(t(readRDS(file = here("h1h5_annotated.rds"))@assay$CITE@data))
cite colnames(cite) = str_sub(colnames(cite), 1, -6)
= cbind(md,cite)
mdf = mdf %>% filter(cohort == "H1N1")
h1md
# match colors in umap
= readRDS(here("data/celltypes_vector_ordered.rds"))
celltypes = pals::kelly(n = 22) %>% as.vector()
cu = cu[-1]
cu = c("midnightblue", cu, "lightslateblue")
cu 15] = "maroon1"
cu[11] = "darkseagreen1"
cu[= cu[-1]
cu = rev(cu)
cu $celltype_joint = factor(h1md$celltype_joint, levels = celltypes)
h1md
###################
# B cells
=
p ggplot(h1md %>% filter(lineage %in% "B cell"), aes(x = CD27, CD38, color = celltype_joint)) +
theme_bw(base_size = 12) +
scale_color_manual(values = c("lightslateblue","#2B3D26", "#882D17")) +
geom_density_2d() +
labs(color="celltype") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16)) +
theme(legend.position = c(0.85, 0.4)) +
theme(legend.key.size = unit(0.35, units='cm'))
ggsave(p, filename = paste0(figpath,'Bcell.pdf'), width = 3.5, height = 3)
=
p ggplot(h1md %>% filter(lineage %in% "B cell"), aes(x = CD27, CD38)) +
theme_bw(base_size = 12) +
geom_bin2d(bins = 200) +
scale_fill_viridis(option = "B") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16)) +
theme(legend.position = c(0.85, 0.4)) +
theme(legend.key.size = unit(0.45, units='cm'))
ggsave(p, filename = paste0(figpath,'Bcell_density.pdf'), width = 3.5, height = 3)
###################
### Myeloid Cells
=
p ggplot(h1md %>% filter(lineage %in% "Myeloid") %>% filter(!celltype_joint %in% 'IgA_CD14_Mono'),
aes(x = CD16, CD303, color = celltype_joint)) +
theme_bw(base_size = 12) +
scale_color_manual(values = c("#654522", "#8DB600", "#BE0032","#875692","#222222"))+
geom_density_2d() +
labs(color="celltype") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16)) +
theme(legend.position = c(0.75, 0.6))
pggsave(p, filename = paste0(figpath,'myeloid1.pdf'), width = 3.5, height = 3)
=
p ggplot(h1md %>% filter(lineage %in% "Myeloid") %>% filter(!celltype_joint %in% 'IgA_CD14_Mono'),
aes(x = CD16, CD303)) +
theme_bw(base_size = 12) +
geom_bin2d(bins = 200) +
scale_fill_viridis(option = "B") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16)) +
theme(legend.position = c(0.75, 0.6)) +
theme(legend.key.size = unit(0.45, units = 'cm'))
pggsave(p, filename = paste0(figpath,'myeloid1_density.pdf'), width = 3.5, height = 3)
# prots 2
=
p ggplot(h1md %>% filter(lineage %in% "Myeloid") %>% filter(!celltype_joint %in% 'IgA_CD14_Mono'),
aes(x = CD16, CD14, color = celltype_joint)) +
theme_bw(base_size = 12) +
scale_color_manual(values = c("#654522", "#8DB600", "#BE0032","#875692","#222222"))+
geom_density_2d() +
labs(color="celltype") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16)) +
theme(legend.position = c(0.8, 0.75)) +
theme(legend.key.size = unit(0.45, units = 'cm'))
pggsave(p, filename = paste0(figpath,'myeloid2.pdf'), width = 3.5, height = 3)
#
# ### T clels
= ggplot(h1md %>% filter(lineage %in% "CD8 T cell"), aes(x = CD161, CD45RO, color = celltype_joint)) +
p theme_bw(base_size = 12) +
::scale_color_jco() +
ggscigeom_density_2d() +
labs(color="celltype") +
theme(axis.title.x = element_text(size = 16), axis.title.y = element_text(size = 16))
pggsave(p, filename = paste0(figpath,'cd8Tcell.pdf'), width = 5, height = 3)
# R version 3.5.3 Patched (2019-03-11 r77192)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] viridis_0.5.1 viridisLite_0.3.0 here_0.1 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.1 tidyverse_1.2.1
# [12] Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] readxl_1.3.1 snow_0.4-3 backports_1.1.4 Hmisc_4.2-0 plyr_1.8.4 igraph_1.2.4.1 lazyeval_0.2.2 splines_3.5.3
# [9] crosstalk_1.0.0 digest_0.6.25 foreach_1.4.4 htmltools_0.3.6 lars_1.2 gdata_2.18.0 magrittr_2.0.1 checkmate_1.9.3
# [17] cluster_2.0.7-1 mixtools_1.1.0 ROCR_1.0-7 modelr_0.1.4 R.utils_2.8.0 colorspace_1.4-1 rvest_0.3.4 haven_2.1.0
# [25] xfun_0.7 crayon_1.3.4 jsonlite_1.6 survival_2.43-3 zoo_1.8-6 iterators_1.0.10 ape_5.3 glue_1.3.1
# [33] pals_1.5 gtable_0.3.0 webshot_0.5.1 kernlab_0.9-27 prabclus_2.3-1 DEoptimR_1.0-8 maps_3.3.0 scales_1.0.0
# [41] mvtnorm_1.0-10 bibtex_0.4.2 miniUI_0.1.1.1 Rcpp_1.0.1 metap_1.1 dtw_1.20-1 xtable_1.8-4 htmlTable_1.13.1
# [49] reticulate_1.12 foreign_0.8-71 bit_1.1-14 mapproj_1.2.6 proxy_0.4-23 mclust_5.4.5 SDMTools_1.1-221.1 Formula_1.2-3
# [57] stats4_3.5.3 tsne_0.1-3 htmlwidgets_1.3 httr_1.4.0 gplots_3.0.1.1 RColorBrewer_1.1-2 fpc_2.2-1 acepack_1.4.1
# [65] modeltools_0.2-22 ica_1.0-2 pkgconfig_2.0.2 R.methodsS3_1.7.1 flexmix_2.3-15 nnet_7.3-12 manipulateWidget_0.10.0 tidyselect_0.2.5
# [73] labeling_0.3 rlang_0.4.5 reshape2_1.4.3 later_0.8.0 munsell_0.5.0 cellranger_1.1.0 tools_3.5.3 cli_1.1.0
# [81] generics_0.0.2 broom_0.5.2 ggridges_0.5.1 npsurv_0.4-0 knitr_1.23 bit64_0.9-7 fitdistrplus_1.0-14 robustbase_0.93-5
# [89] rgl_0.100.30 caTools_1.17.1.2 RANN_2.6.1 packrat_0.5.0 pbapply_1.4-0 nlme_3.1-137 mime_0.6 R.oo_1.22.0
# [97] xml2_1.2.0 hdf5r_1.2.0 compiler_3.5.3 rstudioapi_0.10 png_0.1-7 lsei_1.2-0 stringi_1.4.3 lattice_0.20-38
# [105] ggsci_2.9 vctrs_0.2.4 pillar_1.4.1 lifecycle_0.1.0 Rdpack_0.11-0 lmtest_0.9-37 data.table_1.12.2 bitops_1.0-6
# [113] irlba_2.3.3 gbRd_0.4-11 httpuv_1.5.1 R6_2.4.0 latticeExtra_0.6-28 promises_1.0.1 KernSmooth_2.23-15 gridExtra_2.3
# [121] codetools_0.2-16 dichromat_2.0-0 MASS_7.3-51.1 gtools_3.8.1 assertthat_0.2.1 rprojroot_1.3-2 withr_2.1.2 diptest_0.75-7
# [129] parallel_3.5.3 doSNOW_1.0.16 hms_0.4.2 grid_3.5.3 rpart_4.1-13 class_7.3-15 segmented_0.5-4.0 Rtsne_0.15
# [137] shiny_1.3.2 lubridate_1.7.4 base64enc_0.1-3
Calculate and visualize distribution of immune cell frequency across individuals.
mid_res/sample_and_protein_distributions/2.cell.frequency.map.r
# this script is run in R 4.0.5
# sample barplot
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
= here("mid_res/sample_and_protein_distributions/figures/")
figpath
= readRDS(file = "data/h1h5_annotated_with_meta.rds")
s
= s@meta.data %>%
freq_plot select(celltype_joint, sample) %>%
group_by(celltype_joint, sample) %>%
summarise(n = n()) %>%
group_by(sample) %>%
mutate(log_cell = log10(n)) %>%
select(sample, celltype_joint, log_cell) %>%
spread(celltype_joint, log_cell) %>%
mutate(sample = if_else(str_sub(sample, 1, 2) == 'H5',
true = str_sub(sample, -6, -1),
false = sample)) %>%
column_to_rownames("sample") %>%
t()
= read_delim(file = "data/full_metadata/full_sample_metadata.txt", delim = "\t")
annotation
= s@meta.data %>%
md select(sample) %>%
group_by(sample) %>%
summarise(n_cells = log10(n())) %>%
mutate(subjectID = str_sub(sample, -6, -4)) %>%
mutate(timepoint = str_sub(sample, -2, -1)) %>%
mutate(group = plyr::mapvalues(
x = subjectID,
from = annotation$subjectid,
to = annotation$adjMFC)) %>%
select(sample, group, timepoint) %>%
mutate(group = if_else(str_sub(sample, 1, 2) == 'H5', true = "adjuvant", false = group)) %>%
mutate(sample = if_else(
str_sub(sample, 1, 2) == 'H5',
true = str_sub(sample, -6, -1),
false = sample)) %>%
column_to_rownames("sample")
# quant palette
= list(
mat_colors group = c("grey", "red", "deepskyblue3"),
timepoint = c("grey", "orange", "black")
)names(mat_colors$timepoint) = unique(md$timepoint)
names(mat_colors$group) = unique(md$group)
# plot
rownames(freq_plot) = str_replace_all(rownames(freq_plot),pattern = '_', replacement = ' ')
::pheatmap(freq_plot,
pheatmapannotation_col = md,
display_numbers = FALSE,
color = grDevices::colorRampPalette(colors = c("gray100", "black" ))(100),
cluster_cols = F, border_color = NA,
width = 10, height = 5,treeheight_row = 20,
annotation_colors = mat_colors,
filename = paste0(figpath, 'sample_celltype_map.pdf')
)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0
# [7] tidyr_1.1.2 tibble_3.1.8 ggplot2_3.3.3 tidyverse_1.3.0 sp_1.4-5 SeuratObject_4.1.0
# [13] Seurat_4.0.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_2.0-0 deldir_1.0-6 ellipsis_0.3.2 ggridges_0.5.3
# [6] rprojroot_2.0.2 fs_1.5.0 rstudioapi_0.13 spatstat.data_2.1-0 farver_2.0.3
# [11] leiden_0.3.7 listenv_0.8.0 ggrepel_0.9.1 lubridate_1.8.0 fansi_0.4.2
# [16] xml2_1.3.2 codetools_0.2-18 splines_4.0.5 polyclip_1.10-0 jsonlite_1.7.2
# [21] packrat_0.7.0 broom_0.7.5 ica_1.0-2 cluster_2.1.2 dbplyr_2.1.0
# [26] png_0.1-7 rgeos_0.5-9 pheatmap_1.0.12 uwot_0.1.10 shiny_1.6.0
# [31] sctransform_0.3.2 spatstat.sparse_2.0-0 compiler_4.0.5 httr_1.4.2 backports_1.2.1
# [36] assertthat_0.2.1 Matrix_1.4-1 fastmap_1.1.0 lazyeval_0.2.2 cli_3.4.1
# [41] later_1.1.0.1 htmltools_0.5.2 tools_4.0.5 igraph_1.2.6 gtable_0.3.0
# [46] glue_1.6.2 RANN_2.6.1 reshape2_1.4.4 Rcpp_1.0.9 scattermore_0.7
# [51] cellranger_1.1.0 vctrs_0.5.1 nlme_3.1-152 progressr_0.10.0 lmtest_0.9-38
# [56] globals_0.14.0 rvest_0.3.6 mime_0.10 miniUI_0.1.1.1 lifecycle_1.0.3
# [61] irlba_2.3.3 goftest_1.2-2 future_1.21.0 MASS_7.3-53.1 zoo_1.8-8
# [66] scales_1.1.1 spatstat.core_2.0-0 hms_1.0.0 promises_1.2.0.1 spatstat.utils_2.3-0
# [71] parallel_4.0.5 RColorBrewer_1.1-2 reticulate_1.18 pbapply_1.4-3 gridExtra_2.3
# [76] rpart_4.1-15 stringi_1.5.3 rlang_1.0.6 pkgconfig_2.0.3 matrixStats_0.58.0
# [81] lattice_0.20-41 ROCR_1.0-11 tensor_1.5 patchwork_1.1.1 htmlwidgets_1.5.3
# [86] cowplot_1.1.1 tidyselect_1.2.0 parallelly_1.23.0 RcppAnnoy_0.0.18 plyr_1.8.6
# [91] magrittr_2.0.3 R6_2.5.0 generics_0.1.2 DBI_1.1.1 withr_2.4.3
# [96] pillar_1.8.1 haven_2.4.3 mgcv_1.8-34 fitdistrplus_1.1-3 survival_3.2-10
# [101] abind_1.4-5 future.apply_1.7.0 crayon_1.4.1 modelr_0.1.8 KernSmooth_2.23-18
# [106] utf8_1.2.2 spatstat.geom_2.4-0 plotly_4.9.3 readxl_1.3.1 grid_4.0.5
# [111] data.table_1.14.0 reprex_1.0.0 digest_0.6.27 xtable_1.8-4 httpuv_1.5.5
# [116] munsell_0.5.0 viridisLite_0.3.0
Clustered protein histogram distribution across cell types
mid_res/histogram_hclust/hclust_histogram_protein.r
# high resolution histogram heatmaps
# script uses R 3.5.1
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(ggridges))
suppressMessages(library(ggsci))
suppressMessages(library(viridis))
suppressMessages(library(here))
source(file = "functions/analysis_functions.R")
# save paths
= here("mid_res/histogram_hclust/figures/")
figpath dir.create(figpath)
# Define proteins for hclust and visualization
# T cell markers
= c("CD3_PROT", "CD4_PROT", "CD8_PROT", "CD45RA_PROT", "CD45RO_PROT",
tc_markers "CD161_PROT", "CD127_PROT", "CD57_PROT", "CD27_PROT", "CD62L_PROT",
"KLRG1_PROT", "CD103_PROT", "CD25_PROT", "CD31_PROT")
# B cell markers
= c("CD20_PROT", "CD38_PROT", "IgD_PROT", "CD133_PROT", "IgM_PROT", "CD40_PROT")
bc_markers
# monocyte / dc markers
= c("CD33_PROT", "CD14_PROT", "CD16_PROT", "CD141_PROT", "CD11b_PROT")
mono_markers
# NK markers
= c("CD56_PROT")
nk_markers
# CD markers
= c("CD1c_PROT")
dc_markers
# rare cell markers
= c("CD303_PROT", "CD123_PROT", "CD34_PROT")
rare_markers
# cell activation markers
= c("CD71_PROT", "CD183_PROT", "CD184_PROT", "CD185_PROT", "CD39_PROT",
activation "CD279_PROT", "CD278 _PROT","CD194_PROT", "CD195_PROT", "CD196_PROT",
"CD117_PROT", "CD244_PROT")
= c(tc_markers,
prot_use
bc_markers,
mono_markers,
nk_markers,
dc_markers,
rare_markers,
activation) = str_replace(prot_use, pattern = "_PROT", replacement = "")
prot_use_plot
c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# match a color palette to these markers
= rev(c(rep("red3", length(tc_markers)),
my_pal rep("royalblue1", length(bc_markers)),
rep("#009E73", length(mono_markers)),
rep("#0072B2", length(nk_markers)),
rep("#D55E00", length(dc_markers)),
rep("#CC79A7", length(rare_markers)),
rep("black", length(activation))
))
# read in H1 Seurat object
= ReadCohort(joint_object_dir = "data/h1h5_annotated_with_meta.rds", cohort = "H1N1")
h1 = h1 %>% SetAllIdent(id = "celltype_joint") %>% SubsetData(ident.remove = "DOUBLET", subset.raw = TRUE)
h1
# get vector of all clusters
= h1@meta.data$celltype_joint %>% unique
celltypes = SetAllIdent(h1,id = "celltype_joint") %>%
h1 SubsetData(max.cells.per.ident = 1000, random.seed = 1, subset.raw = TRUE)
# convert to tidy ; aggregate as the mean of proteins
= h1@assay$CITE@data %>% t %>% as.data.frame() %>% rownames_to_column("cell")
adt = h1@meta.data %>% select(celltype = celltype_joint)
md = cbind(adt, md)
adt = adt %>%
mean_mtx select(celltype, everything()) %>%
group_by(celltype) %>%
summarize_at(.vars = prot_use, .funs = base::mean) %>%
column_to_rownames("celltype") %>%
%>%
t
as.data.frame
# index for tidying
= rownames(mean_mtx)[1]
index1 = rownames(mean_mtx)[length(rownames(mean_mtx))]
index2
# order by lineage
= h1@meta.data$celltype_joint %>% unique() %>% sort()
celltype_order = celltype_order[c(12,11,7,8,9,10,13,19,14:16,3,17,21,1,2,6,4,5,18,20,22)]
celltype_order
# alt (not used)
# use hclust within pheatmap to get ordered of clustered protein and celltypes
#x = pheatmap::pheatmap(mean_mtx, silent = TRUE)
#celltype_order = colnames(mean_mtx[ ,x$tree_col$order]) %>% rev
# convert tidy and reorder based on hclust
= adt %>%
adt.l select(prot_use, celltype) %>%
gather(key = prot, value = dsb_count, index1:index2) %>%
mutate(prot = str_sub(prot, 1, -6)) %>%
mutate(prot = factor(prot, levels = rev(prot_use_plot))) %>%
mutate(celltype = factor(celltype, levels = celltype_order))
# plot
= length(celltype_order) %>% as.numeric()
col_split =
adt.l %>% filter(dsb_count > -5) %>%
adt.l filter(!celltype=="DOUBLET" )
= ggplot(adt.l, aes(x = dsb_count, y = prot, color = prot, fill = prot)) +
p geom_density_ridges2(show.legend = F, inherit.aes = T, size = 0.1) +
theme_minimal() +
scale_fill_manual(values = my_pal) +
scale_color_manual(values = my_pal) +
geom_vline(xintercept = 0, color = "red", size=0.3) +
xlab("dsb normalized protein expression") +
theme(axis.title.x = element_text(size = 15)) +
facet_wrap(~celltype, ncol = col_split, scales = "free_x") +
theme(panel.spacing.x = unit(0.1, "lines"))+
theme(strip.background = element_blank()) +
theme(strip.text = element_text(colour = 'black', size = 10, angle = 90, hjust = 0)) +
theme(axis.text.x = element_text(size = 5, family = "Helvetica", color = "black")) +
theme(axis.text.y = element_text(size = 10, family = "Helvetica", color = "black"))
ggsave(p, filename = paste0(figpath,"H1_cluster_histogram_heatmap.pdf"), width = 14.5, height =10)
# R version 3.5.3 Patched (2019-03-11 r77192)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] viridis_0.5.1 viridisLite_0.3.0 ggsci_2.9 ggridges_0.5.1 magrittr_1.5 forcats_0.4.0 stringr_1.4.0
# [8] dplyr_0.8.5 purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.1 tidyverse_1.2.1 Seurat_2.3.4
# [15] Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 ellipsis_0.3.0 class_7.3-15 modeltools_0.2-22 mclust_5.4.5
# [7] htmlTable_1.13.1 base64enc_0.1-3 rstudioapi_0.10 proxy_0.4-23 npsurv_0.4-0 flexmix_2.3-15
# [13] bit64_0.9-7 lubridate_1.7.4 mvtnorm_1.0-10 xml2_1.2.0 codetools_0.2-16 splines_3.5.3
# [19] R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-5 knitr_1.23 Formula_1.2-3 jsonlite_1.6
# [25] broom_0.5.2 ica_1.0-2 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7 R.oo_1.22.0
# [31] pheatmap_1.0.12 compiler_3.5.3 httr_1.4.0 backports_1.1.4 assertthat_0.2.1 lazyeval_0.2.2
# [37] cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6 tools_3.5.3 igraph_1.2.4.1
# [43] gtable_0.3.0 glue_1.3.1 RANN_2.6.1 reshape2_1.4.3 Rcpp_1.0.1 cellranger_1.1.0
# [49] vctrs_0.2.4 gdata_2.18.0 ape_5.3 nlme_3.1-137 iterators_1.0.10 fpc_2.2-1
# [55] gbRd_0.4-11 lmtest_0.9-37 xfun_0.7 rvest_0.3.4 lifecycle_0.1.0 irlba_2.3.3
# [61] gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-51.1 zoo_1.8-6 scales_1.0.0 hms_0.4.2
# [67] doSNOW_1.0.16 parallel_3.5.3 RColorBrewer_1.1-2 reticulate_1.12 pbapply_1.4-0 gridExtra_2.3
# [73] rpart_4.1-13 segmented_0.5-4.0 latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4 checkmate_1.9.3
# [79] caTools_1.17.1.2 bibtex_0.4.2 Rdpack_0.11-0 SDMTools_1.1-221.1 rlang_0.4.5 pkgconfig_2.0.2
# [85] dtw_1.20-1 prabclus_2.3-1 bitops_1.0-6 lattice_0.20-38 ROCR_1.0-7 labeling_0.3
# [91] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 R6_2.4.0 generics_0.0.2
# [97] snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0 haven_2.1.0 pillar_1.4.1 foreign_0.8-71
# [103] withr_2.1.2 fitdistrplus_1.0-14 mixtools_1.1.0 survival_2.43-3 nnet_7.3-12 tsne_0.1-3
# [109] modelr_0.1.4 crayon_1.3.4 hdf5r_1.2.0 KernSmooth_2.23-15 readxl_1.3.1 grid_3.5.3
# [115] data.table_1.12.2 metap_1.1 digest_0.6.19 diptest_0.75-7 R.utils_2.8.0 stats4_3.5.3
# [121] munsell_0.5.0
Aggregated protein expression heatmap across 780 libraries by sample x timepoint
mid_res/histogram_hclust/hclust_histogram_protein.r
suppressMessages(library(ComplexHeatmap))
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
# functions
source("functions/analysis_functions.R")
source("functions/protein_annotation_functions.r")
# save path
= here("mid_res/aggregated_protein_libraries/figures/")
figpath dir.create(figpath)
# cluster information combined heatmap
= readRDS(file = here("data/h1h5_annotated_with_meta.rds")) %>%
h1 SetAllIdent(id = "celltype_joint") %>%
SubsetData(ident.remove = "DOUBLET")
# specify subsets of proteins for fig 1
= c(
prot_order # B
"CD19", "CD20", "IgM", "IgD", "CD40", "CD185",
# pdc
"CD123", "CD303",
# my lin / dc
"HLA-DR", "CD11c",
# DC and mono
"CD71", "CD14", "CD16", "CD11b", "CD1d", "CD1c",
# hsc
"CD34",
# T
"CD3", "CD45RO", "CD45RA" , "CD62L", "CD27",
"CD28", "CD279", "CD4", "CD8", "CD161",
# nk
"CD244", "KLRG1", "CD127", "CD56", "CD57", "CD38",
# state
"CD103" , "CD196", "CD195", "CD25", "CD86",
"CD69", "CD31"
)
# aggregate protein data
# single cell data
= as.data.frame(t(h1@assay$CITE@data))
prot.dat # replact ADT string
colnames(prot.dat) = str_sub(colnames(prot.dat), start = 1, end = -6)
# aggregate (mean)
= cbind(prot.dat, h1@meta.data) %>%
prot_data group_by(sample, subject_id = sampleid , timepoint,
celltype = celltype_joint,
time_cohort, batch, age, gender, antibody_response = adjmfc.group) %>%
summarize_at(.vars = colnames(prot.dat), .funs = median) %>%
ungroup() %>%
select(sample, celltype, prot_order) %>%
arrange(celltype, sample) %>%
unite(col = "sample_celltype", sample:celltype, sep = "_") %>%
column_to_rownames("sample_celltype") %>%
t()
# cell frequency
= h1@meta.data
md = md %>%
df group_by(sample, subject_id = sampleid, timepoint,
time_cohort, batch, celltype = celltype_joint, antibody_response = adjmfc.group) %>%
age, gender, summarize(count = n(), log_lib_size = log10(sum(nUMI))) %>%
group_by(sample) %>%
mutate(cell_freq=count/sum(count)*100) %>%
arrange(celltype, sample) %>%
mutate(log_cell_count = log10(count))
= df$cell_freq
cellfreq
# celltype
= df$celltype
cellt = HeatmapAnnotation(celltype = cellt,
ha col = list(celltype = c(
"BC_Mem" = "lightslateblue",
"BC_Naive" = "#2B3D26",
"CD103_Tcell" = "#E25822",
"CD14_Mono"= "red",
"CD16_Mono" = "firebrick4",
"CD38_Bcell" = "#882D17",
"CD4_CD161_Mem_Tcell" = "navy",
"CD4_CD25_Tcell"= "#B3446C",
"CD4_CD56_Tcell" = "maroon1",
"CD4_CD57_Tcell" = "#604E97",
"CD4_Efct_Mem_Tcell" ="#F99379",
"CD4Naive_Tcell" = "#0067A5",
"CD8_CD161_Tcell" = "olivedrab",
"CD8_Mem_Tcell" = "#008856",
"CD8_Naive_Tcell" = "#848482",
"CD8_NKT" = "#C2B280",
"HSC" = "#BE0032",
"IgA_CD14_Mono" = "#A1CAF1",
"MAIT_Like" = "#F38400",
"mDC" = "#875692",
"NK" = "#F3C300",
"pDC" = "#222222"))
)
# Create annotations
= columnAnnotation(
libraries_map log_library_size = column_anno_points(
$log_lib_size,
dfsize = unit(0.3, 'mm'),
pch = 21, axis = TRUE, border = TRUE,
gp = gpar(color = "black")
),height = unit(1.8, units = "cm")
)
# matrix color values
= circlize::colorRamp2(breaks = c(-1,0,2,4,8,12,16,20),
col_fun colors = viridis::viridis(n = 8, option = "B"))
# organize proteins by lineages
rownames(prot_data) = factor(rownames(prot_data),levels = prot_order)
# cluster by column; save heatmap
pdf(paste0(figpath,"heatmap3.pdf"), width = 7, height = 6.5)
draw(
::Heatmap(
ComplexHeatmapmatrix = prot_data,
name = "",
col = col_fun,
row_names_gp = gpar(color = "black", fontsize = 10),
top_annotation = ha,
bottom_annotation = libraries_map,
show_column_names = FALSE,
cluster_rows = FALSE,
cluster_columns = TRUE,
use_raster = TRUE), show_annotation_legend = FALSE)
dev.off()
sessionInfo()
# R version 3.5.3 Patched (2019-03-11 r77192)
# attached base packages:
# [1] grid stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] ComplexHeatmap_1.20.0 viridis_0.5.1 viridisLite_0.3.0 here_0.1 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3
# [9] readr_1.3.1 tidyr_1.0.2 tibble_2.1.1 tidyverse_1.2.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] circlize_0.4.10 readxl_1.3.1 snow_0.4-3 backports_1.1.4 Hmisc_4.2-0 plyr_1.8.4 igraph_1.2.4.1
# [8] lazyeval_0.2.2 splines_3.5.3 crosstalk_1.0.0 digest_0.6.25 foreach_1.4.4 htmltools_0.3.6 lars_1.2
# [15] fansi_0.4.0 gdata_2.18.0 magrittr_2.0.1 checkmate_1.9.3 cluster_2.0.7-1 mixtools_1.1.0 ROCR_1.0-7
# [22] modelr_0.1.4 R.utils_2.8.0 colorspace_1.4-1 rvest_0.3.4 haven_2.1.0 xfun_0.7 crayon_1.3.4
# [29] jsonlite_1.6 survival_2.43-3 zoo_1.8-6 iterators_1.0.10 ape_5.3 glue_1.3.1 pals_1.5
# [36] gtable_0.3.0 webshot_0.5.1 GetoptLong_1.0.2 kernlab_0.9-27 shape_1.4.4 prabclus_2.3-1 DEoptimR_1.0-8
# [43] maps_3.3.0 scales_1.0.0 pheatmap_1.0.12 mvtnorm_1.0-10 bibtex_0.4.2 miniUI_0.1.1.1 Rcpp_1.0.1
# [50] metap_1.1 dtw_1.20-1 xtable_1.8-4 htmlTable_1.13.1 reticulate_1.12 foreign_0.8-71 bit_1.1-14
# [57] mapproj_1.2.6 proxy_0.4-23 mclust_5.4.5 SDMTools_1.1-221.1 Formula_1.2-3 stats4_3.5.3 tsne_0.1-3
# [64] htmlwidgets_1.3 httr_1.4.0 gplots_3.0.1.1 RColorBrewer_1.1-2 fpc_2.2-1 ellipsis_0.3.0 acepack_1.4.1
# [71] modeltools_0.2-22 ica_1.0-2 pkgconfig_2.0.2 R.methodsS3_1.7.1 flexmix_2.3-15 nnet_7.3-12 utf8_1.1.4
# [78] manipulateWidget_0.10.0 tidyselect_0.2.5 labeling_0.3 rlang_0.4.5 reshape2_1.4.3 later_0.8.0 munsell_0.5.0
# [85] cellranger_1.1.0 tools_3.5.3 cli_1.1.0 generics_0.0.2 broom_0.5.2 ggridges_0.5.1 npsurv_0.4-0
# [92] knitr_1.23 bit64_0.9-7 fitdistrplus_1.0-14 robustbase_0.93-5 rgl_0.100.30 caTools_1.17.1.2 RANN_2.6.1
# [99] packrat_0.5.0 pbapply_1.4-0 nlme_3.1-137 mime_0.6 R.oo_1.22.0 xml2_1.2.0 hdf5r_1.2.0
# [106] compiler_3.5.3 rstudioapi_0.10 png_0.1-7 lsei_1.2-0 stringi_1.4.3 lattice_0.20-38 ggsci_2.9
# [113] vctrs_0.2.4 pillar_1.4.1 lifecycle_0.1.0 GlobalOptions_0.1.2 Rdpack_0.11-0 lmtest_0.9-37 data.table_1.12.2
# [120] bitops_1.0-6 irlba_2.3.3 gbRd_0.4-11 httpuv_1.5.1 R6_2.4.0 latticeExtra_0.6-28 promises_1.0.1
# [127] KernSmooth_2.23-15 gridExtra_2.3 sessioninfo_1.1.1 codetools_0.2-16 dichromat_2.0-0 MASS_7.3-51.1 gtools_3.8.1
# [134] assertthat_0.2.1 rjson_0.2.20 rprojroot_1.3-2 withr_2.1.2 diptest_0.75-7 parallel_3.5.3 doSNOW_1.0.16
# [141] hms_0.4.2 rpart_4.1-13 class_7.3-15 segmented_0.5-4.0 Rtsne_0.15 shiny_1.3.2 lubridate_1.7.4
# [148] base64enc_0.1-3
Analyze transcriptome of manually gated cells using cell type
specific gene signatures
mid_res/pblast_abc_integration/1_pbasc_analysis_gates_modscoresv2.r
# Analysis of plasmablast and Activated_B.
# script uses R 3.5.1
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
# source functions
theme_set(theme_bw())
source("functions/analysis_functions.R")
# file path
= here("mid_res/pblast_abc_integration/figures/")
figpath = here("mid_res/pblast_abc_integration/generated_data/")
datapath dir.create(figpath); dir.create(datapath)
# load h1 annotated data
= ReadCohort(joint_object_dir = "data/h1h5_annotated_with_meta.rds", cohort = "H1N1") %>%
h1 SubsetData(accept.high = 28, subset.name = "CD3_PROT", subset.raw = T)
#manually gate activated memory B cells and plasmablasts
= GenePlot4(h1, gene1 = "CD19_PROT", gene2 = "CD14_PROT", pt.size = 0.1)
p ggsave(p, filename = paste0(figpath,"/cd19cells.png"),width = 4, height = 3)
= GenePlot4(h1, gene1 = "CD19_PROT", gene2 = "CD3_PROT", pt.size = 0.1) +
p geom_vline(xintercept = 8) +
geom_hline(yintercept = 5)
ggsave(p, filename = paste0(figpath,"/cd19cells_2.pdf"),width = 4, height = 3)
############## Pt 1 Manual gate asc abc
= function(SeuratObject, return.seurat = F) {
GateBC = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which(adt["CD19_PROT", ] > 8 &
cells "CD3_PROT", ] < 5 &
adt["CD14_PROT", ] < 5 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells, subset.raw = TRUE)
sub return(sub)
else { return(cells) }
}
}
= GateBC(SeuratObject = h1, return.seurat = F)
bcells_gate = SubsetData(h1, cells.use = bcells_gate, subset.raw = T)
cd19
# abc asc
= GenePlot4(cd19, gene1 = "CD71_PROT", gene2 = "IgD_PROT",pt.size = 0.1) +
p geom_vline(xintercept = 5) + geom_hline(yintercept = 10)
ggsave(p, filename = paste0(figpath,"/cd71igd_cells.pdf"),width = 4, height = 3)
# naive memory
= GenePlot4(cd19, gene1 = "CD27_PROT", gene2 = "IgD_PROT",pt.size = 0.1) +
p geom_vline(xintercept = 4) + geom_hline(yintercept = 10)
ggsave(p, filename = paste0(figpath,"/cd27_igd_cells.pdf"),width = 4, height = 3)
# activated B and asc gate
= function(SeuratObject) {
Gate_Activated_BASC = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which(adt["CD71_PROT", ] > 5 & adt["IgD_PROT", ] < 10))
cells return(cells)
}= Gate_Activated_BASC(SeuratObject = cd19)
Activated_B_asc_cells = SubsetData(cd19, cells.use = Activated_B_asc_cells, subset.raw = T)
Activated_Basc
# plot gates
= GenePlot4(Activated_Basc, gene1 = "CD20_PROT", gene2 = "CD38_PROT", pt.size = 0.6) +
p geom_vline(xintercept = 8) +
geom_hline(yintercept = 10)
ggsave(p, filename = paste0(figpath,"/Activated_B_pblast.pdf"),width = 4, height = 3)
= cbind(Activated_Basc@meta.data, as.data.frame(t(Activated_Basc@assay$CITE@data)))
bmd = bmd %>% mutate(
bmd b_type =
if_else(
> 8 &
CD19_PROT > 5 &
CD71_PROT < 10 &
IgD_PROT < 8 & CD38_PROT > 10,
CD20_PROT true = "Plasmablast",
if_else(
> 8 &
CD19_PROT > 5 &
CD71_PROT < 10 &
IgD_PROT > 8 &
CD20_PROT < 10,
CD38_PROT true = "Activated_Bcell",
false = "NA"
)
)%>%
) filter(b_type %in% c("Plasmablast", "Activated_Bcell")) %>%
select(b_type, barcode_check) %>%
column_to_rownames("barcode_check")
= SubsetData(Activated_Basc, cells.use = rownames(bmd), subset.raw = TRUE) %>% AddMetaData(metadata = bmd)
bsub
########## Pt 2 add module scores for ellebedy gene sets.
= read.table(file = here("signature_curation/ellebedy_genes.txt"), sep = "\t", header = T)
bcgenes = bcgenes %>% filter(celltype == "ABC-")
Activated_B.genes = bcgenes %>% filter(celltype == "ASC-")
asc.genes = list(as.character(Activated_B.genes$Gene), as.character(asc.genes$Gene))
module.list names(module.list) = c("Activated_B_module", "ASC_module")
saveRDS(module.list, file = paste0(datapath,"/ellebedy_bcell.rds"))
# pt 2 module s
# add module score for ellebedy genes
= AddModuleScore(bsub,
bsub genes.list = module.list,
enrich.name = names(module.list),
random.seed = 1)
names(bsub@meta.data)[c(33, 34)] = c("Activated_Bcell_Gene_Score", "Plasmablast_Gene_score")
= bsub@meta.data %>% select(Activated_Bcell_Gene_Score, Plasmablast_Gene_score, b_type)
bsubdf
# plot module scores.
= bsub@meta.data
bsubdf = ggpubr::ggviolin(data = bsubdf, x = "b_type",
p y = c("Plasmablast_Gene_score", "Activated_Bcell_Gene_Score"),
combine = TRUE,
fill = "b_type",
palette = "d3")
= p %>% ggpubr::ggadd(add = "jitter", jitter = 0.35, alpha = 0.4, size = 1, shape = 16)
p = p +
p theme(legend.position = "none") +
ylab("module score") + xlab("") +
theme(strip.background = element_blank()) +
::stat_compare_means(method = "wilcox")
ggpubrggsave(p, filename =paste0(figpath,"/pb_asc_modules.pdf"), height = 3, width = 4.5)
sessionInfo()
# R version 3.5.3 Patched (2019-03-11 r77192)
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] viridis_0.5.1 viridisLite_0.3.0 here_0.1 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.5
# [7] purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_2.1.1 tidyverse_1.2.1 Seurat_2.3.4
# [13] Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 class_7.3-15 modeltools_0.2-22 ggridges_0.5.1 rprojroot_1.3-2
# [7] mclust_5.4.5 htmlTable_1.13.1 base64enc_0.1-3 rstudioapi_0.10 proxy_0.4-23 ggpubr_0.2
# [13] npsurv_0.4-0 flexmix_2.3-15 bit64_0.9-7 lubridate_1.7.4 mvtnorm_1.0-10 xml2_1.2.0
# [19] codetools_0.2-16 splines_3.5.3 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-5 knitr_1.23
# [25] Formula_1.2-3 jsonlite_1.6 packrat_0.5.0 broom_0.5.2 ica_1.0-2 cluster_2.0.7-1
# [31] kernlab_0.9-27 png_0.1-7 R.oo_1.22.0 compiler_3.5.3 httr_1.4.0 backports_1.1.4
# [37] assertthat_0.2.1 lazyeval_0.2.2 cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6
# [43] tools_3.5.3 igraph_1.2.4.1 gtable_0.3.0 glue_1.3.1 RANN_2.6.1 reshape2_1.4.3
# [49] Rcpp_1.0.1 cellranger_1.1.0 vctrs_0.2.4 gdata_2.18.0 ape_5.3 nlme_3.1-137
# [55] iterators_1.0.10 fpc_2.2-1 gbRd_0.4-11 lmtest_0.9-37 xfun_0.7 rvest_0.3.4
# [61] lifecycle_0.1.0 irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-51.1 zoo_1.8-6
# [67] scales_1.0.0 hms_0.4.2 doSNOW_1.0.16 parallel_3.5.3 RColorBrewer_1.1-2 reticulate_1.12
# [73] pbapply_1.4-0 gridExtra_2.3 rpart_4.1-13 segmented_0.5-4.0 latticeExtra_0.6-28 stringi_1.4.3
# [79] foreach_1.4.4 checkmate_1.9.3 caTools_1.17.1.2 bibtex_0.4.2 Rdpack_0.11-0 SDMTools_1.1-221.1
# [85] rlang_0.4.5 pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.3-1 bitops_1.0-6 lattice_0.20-38
# [91] ROCR_1.0-7 labeling_0.3 htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
# [97] magrittr_2.0.1 R6_2.4.0 generics_0.0.2 snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0
# [103] haven_2.1.0 pillar_1.4.1 foreign_0.8-71 withr_2.1.2 fitdistrplus_1.0-14 mixtools_1.1.0
# [109] survival_2.43-3 nnet_7.3-12 tsne_0.1-3 modelr_0.1.4 crayon_1.3.4 hdf5r_1.2.0
# [115] KernSmooth_2.23-15 readxl_1.3.1 grid_3.5.3 data.table_1.12.2 metap_1.1 digest_0.6.25
# [121] diptest_0.75-7 R.utils_2.8.0 stats4_3.5.3 munsell_0.5.0
Analyze variance explained by each factor of a multivariate model fit across all cell types. This model includes call type as a variable. mid_res/variance_partition/1_variance_partition_allsamples.r
# R4
# initialize
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
library(BiocParallel)
library(variancePartition)
library(scglmmr)
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# figpath
= here('mid_res/variance_partition/figures/'); dir.create(figpath, recursive = TRUE)
figpath = here('mid_res/variance_partition/generated_data/'); dir.create(datapath, recursive = TRUE)
datapath
# load data
= readRDS(file = here('data/h1h5_annotated_with_meta.rds'))
s table(s@meta.data$time_cohort, s@meta.data$sampleid)
table(s@meta.data$timepoint, s@meta.data$sampleid)
# pb data
= s@meta.data
meta = s@raw.data
umi pdf()
= scglmmr::SubjectCelltypeTable(metadata = meta, celltype_column = "celltype_joint", sample_column = "sample")
tab
# remove cells prior to pseudobulk analysis
= meta[!meta$celltype_joint %in% c(tab$celltypes_remove, 'DOUBLET'), ]
meta = umi[ ,rownames(meta)]
umi
# make pseudobulk data
= scglmmr::PseudobulkList(rawcounts = umi,
pb metadata = meta,
sample_col = "sample",
celltype_col = "celltype_joint",
avg_or_sum = "sum")
# add cell type to column names of pb list
for (i in 1:length(pb)) {
colnames(pb[[i]]) = paste(colnames(pb[[i]]), names(pb[i]), sep = "~")
}saveRDS(pb,file = paste0(figpath, 'pb_vp.rds'))
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb
# merge pseudobulk data
= do.call(cbind, pb)
pd
# make celltype ~ sample metadata
=
csd data.frame(sid = colnames(pd)) %>%
mutate(sample_celltype = sid) %>%
separate(sid, into = c('sample', 'celltype'), sep = '~')
#column_to_rownames('sample_celltype')
# make subject metadata
=
samplemd %>%
meta group_by(sample) %>%
select(age, sampleid, gender, batch, time_cohort, timepoint, adjmfc.group) %>%
summarise_each(funs = unique) %>%
ungroup() %>%
as.data.frame()
saveRDS(samplemd, file = paste0(here('data/samplemd.rds')))
# sample_celltype metadata for design matrices
= full_join(csd, samplemd, by = 'sample') %>%
cf column_to_rownames('sample_celltype')
#############
# process bulk data
#filter lowly expressed (in this case basically unexpressed genes)
= edgeR::filterByExpr(pd,
gene_keep min.count = 1,
min.total.count = 1,
min.prop = 0.5,
group = as.factor(cf$celltype))
# FALSE TRUE
# 5314 14320
= pd[gene_keep, ]
pd
# normalize bulk data
= edgeR::DGEList(counts = pd, samples = cf)
pd = edgeR::calcNormFactors(object = pd)
pd
##############
# Get voom observational weights
# these precision weights for every gene for every sample model uncertainty
<- model.matrix(~celltype, cf)
design <- voom(pd, design)
v
############
# variance partition model
# specify mixed effects interacion model
= ~ age + (1|gender) + (1|sampleid) + (1|celltype) + (1|timepoint) + (1|adjmfc.group) + (1|celltype:timepoint)
f
# run model on each gene extract varinace explained
<- fitExtractVarPartModel(exprObj = v, formula = f, data = cf, REML = FALSE, BPPARAM = pparam)
vp saveRDS(vp, file = paste0(datapath, 'vp.rds'))
= readRDS(file = here('mid_res/variance_partition/generated_data/vp.rds'))
vp
# plot
= plotVarPart(vp)
p $variable %>% str
dat= p$data
dat levels(dat$variable) = list(`response group` = 'adjmfc.group', `cell type` = 'celltype',
`celltype:timepoint` = 'celltype:timepoint', sex = 'gender',
subjectID = 'sampleid', timepoint = 'timepoint', age = 'age',
residuals = 'Residuals')
= dat %>% filter(!variable == 'Residuals')
dat #dat$variable[dat$variable == 'gender'] = 'sex'
= ggplot(dat, aes(x = reorder(variable, value), y = value, fill = variable , color = variable)) +
p theme_bw() +
theme(axis.text = element_text(color = 'black')) +
geom_boxplot(outlier.color = 'red', outlier.alpha = 0.2, outlier.shape = 21, show.legend = FALSE) +
::scale_fill_npg(alpha = 0.5) +
ggsci::scale_color_npg() +
ggscitheme(axis.text = element_text(color = 'black')) +
ylab('% variance explained') + xlab('') +
coord_flip()
pggsave(p, filename = paste0(figpath,'fullvp.pdf'), width = 4.6, height = 1.5)
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] parallel stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] scglmmr_0.1.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0
# [7] tidyr_1.1.2 tibble_3.0.6 tidyverse_1.3.0 variancePartition_1.20.0 Biobase_2.50.0 BiocGenerics_0.36.1
# [13] scales_1.1.1 BiocParallel_1.24.1 limma_3.46.0 ggplot2_3.3.3 SeuratObject_4.0.0 Seurat_4.0.1
# [19] here_1.0.1
#
# loaded via a namespace (and not attached):
# [1] estimability_1.3 scattermore_0.7 coda_0.19-4 bit64_4.0.5 multcomp_1.4-16
# [6] irlba_2.3.3 DelayedArray_0.16.3 data.table_1.14.0 rpart_4.1-15 RCurl_1.98-1.3
# [11] doParallel_1.0.16 generics_0.1.0 snow_0.4-3 TH.data_1.0-10 callr_3.7.0
# [16] cowplot_1.1.1 usethis_2.0.1 RSQLite_2.2.7 shadowtext_0.0.9 RANN_2.6.1
# [21] future_1.21.0 bit_4.0.4 enrichplot_1.10.2 spatstat.data_2.1-0 xml2_1.3.2
# [26] lubridate_1.7.9.2 httpuv_1.5.5 ggsci_2.9 SummarizedExperiment_1.20.0 assertthat_0.2.1
# [31] viridis_0.5.1 hms_1.0.0 promises_1.2.0.1 fansi_0.4.2 progress_1.2.2
# [36] caTools_1.18.1 dbplyr_2.1.0 readxl_1.3.1 igraph_1.2.6 DBI_1.1.1
# [41] htmlwidgets_1.5.3 spatstat.geom_2.0-1 stats4_4.0.5 ellipsis_0.3.1 ggpubr_0.4.0
# [46] backports_1.2.1 annotate_1.68.0 deldir_0.2-10 MatrixGenerics_1.2.1 vctrs_0.3.6
# [51] remotes_2.4.0 ROCR_1.0-11 abind_1.4-5 cachem_1.0.4 withr_2.4.1
# [56] ggforce_0.3.3 emmeans_1.5.4 sctransform_0.3.2 prettyunits_1.1.1 goftest_1.2-2
# [61] cluster_2.1.2 DOSE_3.16.0 lazyeval_0.2.2 crayon_1.4.1 labeling_0.4.2
# [66] edgeR_3.32.1 pkgconfig_2.0.3 tweenr_1.0.2 GenomeInfoDb_1.26.7 nlme_3.1-152
# [71] pkgload_1.2.1 devtools_2.4.2 rlang_0.4.10 globals_0.14.0 lifecycle_1.0.0
# [76] miniUI_0.1.1.1 sandwich_3.0-0 downloader_0.4 modelr_0.1.8 cellranger_1.1.0
# [81] rprojroot_2.0.2 polyclip_1.10-0 GSVA_1.38.2 matrixStats_0.58.0 lmtest_0.9-38
# [86] graph_1.68.0 Matrix_1.3-2 carData_3.0-4 boot_1.3-27 zoo_1.8-8
# [91] reprex_1.0.0 pheatmap_1.0.12 ggridges_0.5.3 processx_3.5.2 png_0.1-7
# [96] viridisLite_0.3.0 bitops_1.0-6 KernSmooth_2.23-18 blob_1.2.1 qvalue_2.22.0
# [101] parallelly_1.23.0 rstatix_0.7.0 ggsignif_0.6.0 S4Vectors_0.28.1 memoise_2.0.0
# [106] GSEABase_1.52.1 magrittr_2.0.1 plyr_1.8.6 ica_1.0-2 gplots_3.1.1
# [111] zlibbioc_1.36.0 compiler_4.0.5 scatterpie_0.1.7 RColorBrewer_1.1-2 lme4_1.1-26
# [116] fitdistrplus_1.1-3 cli_2.5.0 XVector_0.30.0 listenv_0.8.0 patchwork_1.1.1
# [121] pbapply_1.4-3 ps_1.5.0 MASS_7.3-53.1 mgcv_1.8-34 tidyselect_1.1.0
# [126] stringi_1.5.3 GOSemSim_2.16.1 locfit_1.5-9.4 ggrepel_0.9.1 grid_4.0.5
# [131] fastmatch_1.1-0 tools_4.0.5 rio_0.5.16 future.apply_1.7.0 rstudioapi_0.13
# [136] foreign_0.8-81 foreach_1.5.1 gridExtra_2.3 farver_2.0.3 Rtsne_0.15
# [141] ggraph_2.0.5 digest_0.6.27 rvcheck_0.1.8 BiocManager_1.30.10 shiny_1.6.0
# [146] Rcpp_1.0.6 car_3.0-10 GenomicRanges_1.42.0 broom_0.7.5 egg_0.4.5
# [151] later_1.1.0.1 RcppAnnoy_0.0.18 org.Hs.eg.db_3.12.0 httr_1.4.2 AnnotationDbi_1.52.0
# [156] colorspace_2.0-0 rvest_0.3.6 XML_3.99-0.6 fs_1.5.0 tensor_1.5
# [161] reticulate_1.18 IRanges_2.24.1 splines_4.0.5 uwot_0.1.10 statmod_1.4.35
# [166] spatstat.utils_2.1-0 graphlayouts_0.7.2 plotly_4.9.3 sessioninfo_1.1.1 xtable_1.8-4
# [171] jsonlite_1.7.2 nloptr_1.2.2.2 tidygraph_1.2.0 ggfun_0.0.4 testthat_3.0.2
# [176] R6_2.5.0 pillar_1.4.7 htmltools_0.5.1.1 mime_0.10 glue_1.4.2
# [181] fastmap_1.1.0 minqa_1.2.4 clusterProfiler_3.18.1 codetools_0.2-18 fgsea_1.16.0
# [186] utf8_1.1.4 pkgbuild_1.2.0 mvtnorm_1.1-1 lattice_0.20-41 spatstat.sparse_2.0-0
# [191] pbkrtest_0.5-0.1 curl_4.3 leiden_0.3.7 colorRamps_2.3 gtools_3.8.2
# [196] zip_2.1.1 openxlsx_4.2.3 GO.db_3.12.1 survival_3.2-10 desc_1.3.0
# [201] munsell_0.5.0 DO.db_2.9 GenomeInfoDbData_1.2.4 iterators_1.0.13 haven_2.3.1
# [206] reshape2_1.4.4 gtable_0.3.0 spatstat.core_2.0-0
#
#
#
Analyze variance explained by each factor of multivariate models fit within each cell protein based type. mid_res/variance_partition/2_variance_partition_withincelltype.r
# R4
# initialize
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
library(BiocParallel)
library(variancePartition)
library(scglmmr)
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# figpath
= here('mid_res/variance_partition/figures/'); dir.create(figpath, recursive = TRUE)
figpath = here('mid_res/variance_partition/generated_data/'); dir.create(datapath, recursive = TRUE)
datapath
# load data
= readRDS(file = here('data/h1h5_annotated_with_meta.rds'))
s table(s@meta.data$time_cohort, s@meta.data$sampleid)
table(s@meta.data$timepoint, s@meta.data$sampleid)
# pb data
= s@meta.data
meta = s@raw.data
umi = scglmmr::SubjectCelltypeTable(metadata = meta, celltype_column = "celltype_joint", sample_column = "sample")
tab
# remove cells prior to pseudobulk analysis
= meta[!meta$celltype_joint %in% c(tab$celltypes_remove, 'DOUBLET'), ]
meta = umi[ ,rownames(meta)]
umi
# creat sample metadata
=
samplemd %>%
meta group_by(sample) %>%
select(age, gender, sampleid, batch, time_cohort, timepoint, adjmfc.group) %>%
summarise_each(funs = unique) %>%
ungroup() %>%
as.data.frame()
# read pb data from vpar1 script 1
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb
# run variance partition workflow for each cell subset separately
for (i in 1:length(pb)) {
=
csd data.frame(sid = colnames(pb[[i]])) %>%
mutate(sample_celltype = sid) %>%
separate(sid, into = c('sample', 'celltype'), sep = '~')
# sample_celltype metadata for design matrices
= full_join(csd, samplemd, by = 'sample') %>%
cf column_to_rownames('sample_celltype')
#############
# process bulk data indexed over each celltype
#filter lowly expressed (in this case basically unexpressed genes)
= pb[[i]]
pd = edgeR::filterByExpr(pd,
gene_keep min.count = 5,
min.total.count = 2,
min.prop = 0.5,
group = as.factor(cf$sample))
print(table(gene_keep))
= pd[gene_keep, ]
pd
# normalize bulk data
= edgeR::DGEList(counts = pd, samples = cf)
pd = edgeR::calcNormFactors(object = pd)
pd
##############
# Get voom observational weights
# these precision weights for every gene for every sample model uncertainty
<- model.matrix(~timepoint, cf)
design <- voom(pd, design, plot = TRUE)
v
# specify mixed effects interacion model
= ~ age + (1|gender) + (1|sampleid) + (1|timepoint) + (1|adjmfc.group) + (1|timepoint:adjmfc.group)
f
# run model on each gene extract varinace explained
<- fitExtractVarPartModel(exprObj = v, formula = f, data = cf, REML = FALSE, BPPARAM = pparam)
vp saveRDS(vp, file = paste0(datapath, names(pb)[i], 'vp.rds'))
# plot
= plotVarPart(vp)
p = ggplot(p$data, aes(x = reorder(variable, value), y = value, fill = variable )) +
p theme_bw() +
theme(axis.text = element_text(color = 'black')) +
geom_boxplot(outlier.color = 'red', outlier.alpha = 0.2, outlier.shape = 21, show.legend = FALSE) +
::scale_fill_jama() +
ggsciylab('% variance explained') + xlab('') +
coord_flip()
ggsave(p,filename = paste0(figpath,names(pb)[i],'vp.pdf'), width = 4.5, height = 1.4)
}
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] parallel stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] scglmmr_0.1.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0
# [7] tidyr_1.1.2 tibble_3.0.6 tidyverse_1.3.0 variancePartition_1.20.0 Biobase_2.50.0 BiocGenerics_0.36.1
# [13] scales_1.1.1 BiocParallel_1.24.1 limma_3.46.0 ggplot2_3.3.3 SeuratObject_4.0.0 Seurat_4.0.1
# [19] here_1.0.1
#
# loaded via a namespace (and not attached):
# [1] estimability_1.3 scattermore_0.7 coda_0.19-4 bit64_4.0.5 multcomp_1.4-16
# [6] irlba_2.3.3 DelayedArray_0.16.3 data.table_1.14.0 rpart_4.1-15 RCurl_1.98-1.3
# [11] doParallel_1.0.16 generics_0.1.0 snow_0.4-3 TH.data_1.0-10 callr_3.7.0
# [16] cowplot_1.1.1 usethis_2.0.1 RSQLite_2.2.7 shadowtext_0.0.9 RANN_2.6.1
# [21] future_1.21.0 bit_4.0.4 enrichplot_1.10.2 spatstat.data_2.1-0 xml2_1.3.2
# [26] lubridate_1.7.9.2 httpuv_1.5.5 ggsci_2.9 SummarizedExperiment_1.20.0 assertthat_0.2.1
# [31] viridis_0.5.1 hms_1.0.0 promises_1.2.0.1 fansi_0.4.2 progress_1.2.2
# [36] caTools_1.18.1 dbplyr_2.1.0 readxl_1.3.1 igraph_1.2.6 DBI_1.1.1
# [41] htmlwidgets_1.5.3 spatstat.geom_2.0-1 stats4_4.0.5 ellipsis_0.3.1 ggpubr_0.4.0
# [46] backports_1.2.1 annotate_1.68.0 deldir_0.2-10 MatrixGenerics_1.2.1 vctrs_0.3.6
# [51] remotes_2.4.0 ROCR_1.0-11 abind_1.4-5 cachem_1.0.4 withr_2.4.1
# [56] ggforce_0.3.3 emmeans_1.5.4 sctransform_0.3.2 prettyunits_1.1.1 goftest_1.2-2
# [61] cluster_2.1.2 DOSE_3.16.0 lazyeval_0.2.2 crayon_1.4.1 labeling_0.4.2
# [66] edgeR_3.32.1 pkgconfig_2.0.3 tweenr_1.0.2 GenomeInfoDb_1.26.7 nlme_3.1-152
# [71] pkgload_1.2.1 devtools_2.4.2 rlang_0.4.10 globals_0.14.0 lifecycle_1.0.0
# [76] miniUI_0.1.1.1 sandwich_3.0-0 downloader_0.4 modelr_0.1.8 cellranger_1.1.0
# [81] rprojroot_2.0.2 polyclip_1.10-0 GSVA_1.38.2 matrixStats_0.58.0 lmtest_0.9-38
# [86] graph_1.68.0 Matrix_1.3-2 carData_3.0-4 boot_1.3-27 zoo_1.8-8
# [91] reprex_1.0.0 pheatmap_1.0.12 ggridges_0.5.3 processx_3.5.2 png_0.1-7
# [96] viridisLite_0.3.0 bitops_1.0-6 KernSmooth_2.23-18 blob_1.2.1 qvalue_2.22.0
# [101] parallelly_1.23.0 rstatix_0.7.0 ggsignif_0.6.0 S4Vectors_0.28.1 memoise_2.0.0
# [106] GSEABase_1.52.1 magrittr_2.0.1 plyr_1.8.6 ica_1.0-2 gplots_3.1.1
# [111] zlibbioc_1.36.0 compiler_4.0.5 scatterpie_0.1.7 RColorBrewer_1.1-2 lme4_1.1-26
# [116] fitdistrplus_1.1-3 cli_2.5.0 XVector_0.30.0 listenv_0.8.0 patchwork_1.1.1
# [121] pbapply_1.4-3 ps_1.5.0 MASS_7.3-53.1 mgcv_1.8-34 tidyselect_1.1.0
# [126] stringi_1.5.3 GOSemSim_2.16.1 locfit_1.5-9.4 ggrepel_0.9.1 grid_4.0.5
# [131] fastmatch_1.1-0 tools_4.0.5 rio_0.5.16 future.apply_1.7.0 rstudioapi_0.13
# [136] foreign_0.8-81 foreach_1.5.1 gridExtra_2.3 farver_2.0.3 Rtsne_0.15
# [141] ggraph_2.0.5 digest_0.6.27 rvcheck_0.1.8 BiocManager_1.30.10 shiny_1.6.0
# [146] Rcpp_1.0.6 car_3.0-10 GenomicRanges_1.42.0 broom_0.7.5 egg_0.4.5
# [151] later_1.1.0.1 RcppAnnoy_0.0.18 org.Hs.eg.db_3.12.0 httr_1.4.2 AnnotationDbi_1.52.0
# [156] colorspace_2.0-0 rvest_0.3.6 XML_3.99-0.6 fs_1.5.0 tensor_1.5
# [161] reticulate_1.18 IRanges_2.24.1 splines_4.0.5 uwot_0.1.10 statmod_1.4.35
# [166] spatstat.utils_2.1-0 graphlayouts_0.7.2 plotly_4.9.3 sessioninfo_1.1.1 xtable_1.8-4
# [171] jsonlite_1.7.2 nloptr_1.2.2.2 tidygraph_1.2.0 ggfun_0.0.4 testthat_3.0.2
# [176] R6_2.5.0 pillar_1.4.7 htmltools_0.5.1.1 mime_0.10 glue_1.4.2
# [181] fastmap_1.1.0 minqa_1.2.4 clusterProfiler_3.18.1 codetools_0.2-18 fgsea_1.16.0
# [186] utf8_1.1.4 pkgbuild_1.2.0 mvtnorm_1.1-1 lattice_0.20-41 spatstat.sparse_2.0-0
# [191] pbkrtest_0.5-0.1 curl_4.3 leiden_0.3.7 colorRamps_2.3 gtools_3.8.2
# [196] zip_2.1.1 openxlsx_4.2.3 GO.db_3.12.1 survival_3.2-10 desc_1.3.0
# [201] munsell_0.5.0 DO.db_2.9 GenomeInfoDbData_1.2.4 iterators_1.0.13 haven_2.3.1
# [206] reshape2_1.4.4 gtable_0.3.0 spatstat.core_2.0-0
#
#
#
Figure generation from analysis above mid_res/variance_partition/3_figures_variance_partition_withincelltype.R
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(variancePartition))
source('functions/MattPMutils.r')
suppressMessages(library(magrittr))
# figpath
= here('mid_res/variance_partition/figures_vars/'); dir.create(figpath, recursive = TRUE)
figpath = here('mid_res/variance_partition/generated_data2/'); dir.create(datapath, recursive = TRUE)
datapath
# parallel opts
# register(SnowParam(4))
# pparam = SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
= list(celltype = c(
col "BC_Mem" = "lightslateblue",
"BC_Naive" = "#2B3D26",
"CD103_Tcell" = "#E25822",
"CD14_Mono"= "red",
"CD16_Mono" = "firebrick4",
"CD38_Bcell" = "#882D17",
"CD4_CD161_Mem_Tcell" = "navy",
"CD4_CD25_Tcell"= "#B3446C",
"CD4_CD56_Tcell" = "maroon1",
"CD4_CD57_Tcell" = "#604E97",
"CD4_Efct_Mem_Tcell" ="#F99379",
"CD4Naive_Tcell" = "#0067A5",
"CD8_CD161_Tcell" = "olivedrab",
"CD8_Mem_Tcell" = "#008856",
"CD8_Naive_Tcell" = "#848482",
"CD8_NKT" = "#C2B280",
"HSC" = "#BE0032",
"IgA_CD14_Mono" = "#A1CAF1",
"MAIT_Like" = "#F38400",
"mDC" = "#875692",
"NK" = "#F3C300",
"pDC" = "#222222"))
= structure(col[[1]])
ccu names(ccu) = str_replace_all(string = names(ccu), pattern = '_', replacement = ' ')
= sapply(ccu, col.alpha, 0.5)
ccu2
# load bulk data
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb = list.files(path = here('mid_res/variance_partition/generated_data/'),
dl pattern = '.rds', recursive = TRUE,full.names = TRUE)
= dl[-c(15,17)] # remove total bulk
dl
# get cell type names (file names)
= list.files(path = here('mid_res/variance_partition/generated_data/'),
cts pattern = '.rds', recursive = TRUE,full.names = FALSE)
= cts[-c(15,17)]
cts = str_replace_all(string = cts,pattern = 'vp.rds', replacement = '')
cts
# read and format variance partition results
= lapply(dl, readRDS)
vl names(vl) = cts
= list()
dl for (i in 1:length(vl)) {
= plotVarPart(vl[[i]])
p $data$celltype = names(vl[i])
p= p$data
d = d
dl[[i]]
}
# combine results across cell types
%>% head
dl[[i]] = reduce(dl, .f = rbind)
test = test %>% select(-c(gene))
test2 $celltype = factor(test2$celltype, levels = cts)
test2
# rename metadata vars
levels(test2$variable) =
list(
`response group` = 'adjmfc.group',
sex = 'gender',
subjectID = 'sampleid',
timepoint = 'timepoint',
`timepoint:response` = "timepoint:adjmfc.group",
age = 'age',
residuals = 'Residuals'
)
# visualize full results
$celltype = str_replace_all(test2$celltype, pattern = '_',replacement = ' ')
test2= test2 %>% filter(variable %in% c('age', 'sex', 'subjectID', 'timepoint'))
d $variable = factor(d$variable,levels = c('subjectID', 'timepoint', 'age', 'sex'))
d
# add outlier designation
= d %>%
d group_by(celltype, variable) %>%
mutate(outlier = value > quantile(value, 0.75) + IQR(value) * 1.5) %>%
ungroup
= ggplot(d, aes(x = reorder(celltype,value), y = value , color = celltype, fill= celltype)) +
p facet_wrap(~variable, nrow = 1) +
theme_bw() +
theme(axis.text = element_text(color = 'black', size = 10)) +
geom_boxplot(varwidth = TRUE,
outlier.color = 'red',
outlier.alpha = 0.2,
outlier.shape = NA,
show.legend = FALSE, size = 0.3) +
geom_point(data = function(x) dplyr::filter_(x, ~ outlier),
position = 'jitter',
shape = 21, size = 1.1, stroke = 0, alpha = 1/3,
show.legend = FALSE) +
scale_color_manual(values = ccu) +
scale_fill_manual(values = ccu2) +
ylab('% variance explained') + xlab('') +
theme(strip.background = element_blank(),
strip.text = element_text(size = 18, color = 'black'),
axis.text.y = element_text(size = 15, color = 'black'),
axis.title = element_text(size = 16, color = 'black')) +
coord_flip()
pggsave(p, filename = paste0(figpath, 'vpartfull_.pdf'), width = 10, height = 5.1)
ggsave(p, filename = paste0(figpath, 'vpartfull_.png'), width = 10, height = 5.1)
########
# monnocyte
= vl$CD14_Mono %>%
m as.data.frame() %>%
rownames_to_column('gene')
colnames(m) = c(
'gene',
'response.group',
'sex',
'subjectID',
'timepoint',
'timepoint:response',
'age',
'residuals'
)
# rank genes by vars
= m[order(desc(m$sampleid)), ]
ds
# get sampele meta data to add to gene data
= readRDS(file = here('data/samplemd.rds'))
samplemd = pb$CD14_Mono
mdat = edgeR::cpm(mdat, log = TRUE)
mdat
# plot genes
= plotPercentBars(vl$CD14_Mono[c('DDX3Y', 'TMEM176B', 'STAT1','PPARGC1', 'TP53RK'), ] )
p levels(p$data$variable) = c('response.group', 'sex', 'SubjectID', 'timepoint', 'timepoint:response', 'age', 'residuals')
levels(p$data$variable) = c('response.group', 'sex', 'subjectID', 'timepoint', 'timepoint:response', 'age', 'residuals')
= p +
p ::scale_fill_jama(alpha = 0.9) +
ggscitheme_bw() +
theme(axis.text = element_text(size = 15, color = 'black')) +
theme(axis.title = element_text(size = 20, color = 'black'))+
theme(legend.position = 'top') +
guides(fill=guide_legend(nrow=4, byrow=TRUE)) +
theme(legend.text = element_text(size = 18), legend.key.size = unit(0.8,units = 'cm')) +
theme(legend.title = element_blank()) +
theme(axis.text.x = element_text(hjust=1))
ggsave(p, filename = paste0(figpath,'genesubpct.pdf'), width = 5.6, height = 5)
# genes
= c(
mgene.highlight 'PPARGC1B',
'TMEM176B',
'LILRA3',
'TP53RK',
'PRPF19',
'HLA-DRB5',
'GBP2',
'PSME2',
'VAMP5',
'STAT1',
'CD69',
'MAP3K8',
'DDX3Y'
)# make matrix
= as.data.frame(as.matrix(t(mdat[ mgene.highlight, ])))
d2 = d2 %>%
d2 rownames_to_column('sid') %>%
separate(sid, into = c('sample', 'celltype'), sep = '~')
= full_join(d2, samplemd, by = 'sample')
d2 $sampleid = str_sub(d2$sampleid, -3,-1)
d2
# subject
=
p ggplot(d2, aes(x = reorder(sampleid, TMEM176B), y = TMEM176B, fill = timepoint)) +
theme_bw() +
xlab('Subject ID') +
geom_point(shape = 21, size = 2.5, fill = col.alpha('black', 0.7) , color = 'white') +
theme(axis.text.x=element_text(angle = -90, hjust = 0, size = 5)) +
theme(axis.title = element_text(color = 'black', size = 14)) +
theme(legend.position = c(0.74, 0.24)) +
theme(legend.key.size = unit(0.2,units = 'cm')) +
scale_fill_manual(values = c('black', 'black', 'black'))
pggsave(p, filename = paste0(figpath, 'TMEM176B.pdf'), width = 2.5, height = 2.5)
# sex
=
p ggplot(d2, aes(x = gender, y = DDX3Y)) +
theme_bw() +
xlab('Sex') +
theme(axis.title = element_text(color = 'black', size = 14)) +
geom_boxplot(show.legend = FALSE, fill = col.alpha(acol = 'black', alpha = 0.3))
ggsave(p, filename = paste0(figpath, 'sex_gene.pdf'), width = 2.5, height = 2.5)
# time
=
p ggplot(d2, aes(x = timepoint, y = STAT1)) +
theme_bw() +
xlab('Time') +
theme(axis.title = element_text(color = 'black', size = 14)) +
geom_boxplot(show.legend = FALSE, fill = col.alpha(acol = 'black', alpha = 0.3))
ggsave(p, filename = paste0(figpath, 'timegene.pdf'), width = 2.5, height = 2.5)
# Age
=
p ggplot(d2, aes(x = age, y = TP53RK)) +
theme_bw() +
theme(axis.title = element_text(color = 'black', size = 14)) +
geom_point(shape = 21, size = 2.5, fill = col.alpha('black', 0.7) , color = 'white') +
scale_fill_manual(values = c('black')) +
geom_smooth(method = 'lm', color= 'black') +
::stat_cor(label.x.npc = 0.1, label.y.npc = 0.1)
ggpubr
pggsave(p, filename = paste0(figpath, 'agegene2.pdf'), width = 2.5, height = 2.5)
# Age 2
=
p ggplot(d2, aes(x = age, y = PPARGC1B)) +
theme_bw() +
theme(axis.title = element_text(color = 'black', size = 14)) +
geom_point(shape = 21, size = 2.5, fill = col.alpha('black', 0.7) , color = 'white') +
scale_fill_manual(values = c('black')) +
geom_smooth(method = 'lm', color= 'black') +
::stat_cor(label.x.npc = 0.1, label.y.npc = 0.1)
ggpubr
pggsave(p, filename = paste0(figpath, 'agegene.pdf'), width = 2.5, height = 2.5)
Gene set enrichment of genes ranked by variance explained by age in
CD8 T cell subsets.
mid_res/variance_partition/4_age_variancefraction_enrichment.r
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(variancePartition))
source('functions/MattPMutils.r')
library(magrittr)
# figpath
= here('mid_res/variance_partition/figures_vars/'); dir.create(figpath, recursive = TRUE)
figpath = here('mid_res/variance_partition/generated_data2/'); dir.create(datapath, recursive = TRUE)
datapath = readRDS(file = here('data/samplemd.rds'))
samplemd # parallel opts
# register(SnowParam(4))
# pparam = SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
= list(celltype = c(
col "BC_Mem" = "lightslateblue",
"BC_Naive" = "#2B3D26",
"CD103_Tcell" = "#E25822",
"CD14_Mono"= "red",
"CD16_Mono" = "firebrick4",
"CD38_Bcell" = "#882D17",
"CD4_CD161_Mem_Tcell" = "navy",
"CD4_CD25_Tcell"= "#B3446C",
"CD4_CD56_Tcell" = "maroon1",
"CD4_CD57_Tcell" = "#604E97",
"CD4_Efct_Mem_Tcell" ="#F99379",
"CD4Naive_Tcell" = "#0067A5",
"CD8_CD161_Tcell" = "olivedrab",
"CD8_Mem_Tcell" = "#008856",
"CD8_Naive_Tcell" = "#848482",
"CD8_NKT" = "#C2B280",
"HSC" = "#BE0032",
"IgA_CD14_Mono" = "#A1CAF1",
"MAIT_Like" = "#F38400",
"mDC" = "#875692",
"NK" = "#F3C300",
"pDC" = "#222222"))
= structure(col[[1]])
ccu names(ccu) = str_replace_all(string = names(ccu), pattern = '_', replacement = ' ')
= sapply(ccu, col.alpha, 0.5)
ccu2
# set theme
= list(
mtheme theme_bw(),
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
)
# load bulk data
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb = list.files(path = here('mid_res/variance_partition/generated_data/'),
dl pattern = '.rds', recursive = TRUE,full.names = TRUE)
= dl[-c(15,17)] # remove total bulk
dl
# get cell type names (file names)
= list.files(path = here('mid_res/variance_partition/generated_data/'),
cts pattern = '.rds', recursive = TRUE,full.names = FALSE)
= cts[-c(15,17)]
cts = str_replace_all(string = cts,pattern = 'vp.rds', replacement = '')
cts
# read and format variance partition results
= lapply(dl, readRDS)
vl names(vl) = cts
= list()
dl for (i in 1:length(vl)) {
= plotVarPart(vl[[i]])
p $data$celltype = names(vl[i])
p= p$data
d = d
dl[[i]]
}
# rank genes by variance fraction assciated with age
= readRDS(file = here('signature_curation/hallmark.rds'))
hlmk # parallel opts
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
= slist = list()
agelist #vars = c('age', 'timepoint', 'gender', 'sampleid')
for (u in 1:length(vl)) {
# get variance fractions
= vl[[u]] %>%
m as.data.frame() %>%
rownames_to_column('gene')
# rank genes by sex
= m[order(desc(m$age)), ]
da = structure(da$age, names= da$gene)
rank.age
# rank genes by subject
= m[order(desc(m$sampleid)), ]
ds = structure(ds$sampleid, names= ds$gene)
rank.subject
# format lists
= rank.age
agelist[[u]] = rank.subject
slist[[u]]
}
# run gsea for age and subject
#age.gs = scglmmr::FgseaList(rank.list.celltype = age.gsea,pathways = hlmk, scoreType = "pos", BPPARAM= pparam)
= sgsea = list()
age.gsea for (u in 1:length(agelist)) {
= fgsea::fgsea(hlmk, agelist[[u]], scoreType = "pos", BPPARAM = pparam)
age.gsea[[u]] = fgsea::fgsea(hlmk, slist[[u]], scoreType = "pos", BPPARAM = pparam)
sgsea[[u]]
}for (u in 1:length(agelist)) {
$celltype = names(vl)[u]
age.gsea[[u]]
}names(age.gsea) = names(vl)
= scglmmr::PlotFgsea(gsea_result_list = age.gsea, padj_filter = 0.05 )
p saveRDS(age.gsea,file = paste0(datapath,'age.gsea.rds'))
= lapply(age.gsea, function(x)
age.gsea.sub %>% filter(
x %in% c(
pathway 'HALLMARK_IL2_STAT5_SIGNALING',
'HALLMARK_IL6_JAK_STAT3_SIGNALING',
'HALLMARK_INFLAMMATORY_RESPONSE',
'HALLMARK_ALLOGRAFT_REJECTION'
)
))
= scglmmr::LeadingEdgeIndexed(gsea.result.list = age.gsea.sub,padj.threshold = 0.05)
li = Filter(li, f = length)
li $CD8_CD161_Tcell
li
= bind_rows(age.gsea.sub, .id = 'celltype')
dage = dage %>% filter(padj < 0.05)
dage
$pathway = str_replace_all(dage$pathway,pattern = 'HALLMARK_',replacement = '')
dage$pathway = str_replace_all(dage$pathway,pattern = '_',replacement = ' ')
dage= c("olivedrab", "#848482")
cu = sapply(cu, col.alpha, 0.5)
cu2 =
p ggplot(dage %>%
filter(celltype %in% c('CD8_Naive_Tcell', 'CD8_CD161_Tcell')),
aes(x = NES, y = pathway, label = celltype,
group = celltype, fill = celltype, color = celltype)) +
+
mtheme geom_linerange(aes(x = NES, color = celltype, xmin = 0, xmax = NES),
position = position_dodge(width = 0.35)) +
geom_point(aes(x = NES, color = celltype),position = position_dodge(width = 0.35)) +
geom_point(shape = 21, size = 2.5, position = position_dodge(width = 0.35)) +
::scale_fill_npg() +
ggscitheme(axis.text = element_text(color = 'black')) +
geom_vline(xintercept = 0, linetype = 'dashed') +
ylab('') +
xlab('Normalized Enrichment Score') +
theme(axis.title.x = element_text(size = 14)) +
theme(axis.text.y = element_text(size = 12)) +
ggtitle('Age associated variance enrichment')
pggsave(p,filename = paste0(figpath, 'tcell_age.pdf'), width = 6, height =2.2)
# CD8 Naive
# logcpm matrix
= unique(unlist(li$CD8_Naive_Tcell, use.names = FALSE))
cd8.genes = pb$CD8_Naive_Tcell
mdat = edgeR::cpm(mdat, log = TRUE)
mdat = as.data.frame(as.matrix(t(mdat[cd8.genes,])))
d2 = d2 %>%
d2 rownames_to_column('sid') %>%
separate(sid, into = c('sample', 'celltype'), sep = '~')
= full_join(d2, samplemd, by = 'sample')
d2 $sampleid = str_sub(d2$sampleid, -3, -1)
d2
# scale age
= function(x) {
scale.simple - mean(x)) / sd(x)
(x
}$age = scale.simple(d2$age)
d2
# fit models
= d2 %>% select(age, all_of(cd8.genes))
dmat = d2$age
age.scaled
= apply(dmat[, 2:ncol(dmat)], MARGIN = 2, function(x) {
age.coef = lm(x ~ 0 + age.scaled)
y return(y)
})
= lapply(age.coef, broom::tidy)
age.res = bind_rows(age.res,.id = 'gene')
age.res
= age.res %>%
age.pos filter(estimate > 0) %$%
geneplot(agelist[[11]][age.pos])
# save CD8 naive age associated genes with positive effect size
= age.res %>%
age.pos.cd8n filter(estimate > 0) %$%
genesaveRDS(age.pos.cd8n, file = paste0(datapath, 'age.pos.cd8n.rds'))
= data.frame(age.var = agelist[[11]][age.pos]) %>%
age.var rownames_to_column('gene')
=
p ggplot(age.var, aes(y = reorder(gene, age.var) , x = age.var*100, label = gene )) +
+
mtheme theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
ylab('') +
xlab('% variance explained by age') +
xlim(c(-5, 40)) +
geom_point(color = "#4DBBD5FF") +
::geom_text_repel(data = age.var %>%
ggrepelfilter(gene %in% c('KLF6', 'RGS16','TNF', 'IL12A', 'HLA-DQA1',
'ABCA1', 'CD70','GZMB', 'IL10A', "FASLG",
'CCL5', 'NOD2', 'CD74', 'IFNG', 'CCL4', 'FAS')),
size = 3,
force = 1,
nudge_x = -20,
direction = "y",
hjust = 1,
segment.size = 0.001) +
ggtitle('Positive association with age\nCD8 Naive T cells') +
theme(axis.title = element_text(size = 18))
ggsave(p,filename = paste0(figpath, 'cd8n.varexp.age.positive.pdf'), width = 4, height = 4)
# CD8 CD161
# logcpm matrix
= unique(unlist(li$CD8_CD161_Tcell,use.names = FALSE))
cd161.genes = pb$CD8_CD161_Tcell
mdat = edgeR::cpm(mdat, log = TRUE)
mdat = as.data.frame(as.matrix(t(mdat[cd161.genes, ])))
d2 = d2 %>%
d2 rownames_to_column('sid') %>%
separate(sid, into = c('sample', 'celltype'), sep = '~')
= full_join(d2, samplemd, by = 'sample')
d2 $sampleid = str_sub(d2$sampleid, -3,-1)
d2
# scale age
= function(x) { (x - mean(x)) / sd(x)}
scale.simple $age = scale.simple(d2$age)
d2
# fit models
= d2 %>% select(age, all_of(cd161.genes))
dmat = d2$age
age.scaled
= apply(dmat[ ,2:ncol(dmat)],MARGIN = 2, function(x) {
age.coef = lm(x ~ 0 + age.scaled)
y return(y)
} )
= lapply(age.coef, broom::tidy)
age.res = bind_rows(age.res,.id = 'gene')
age.res
= age.res %>% filter(estimate > 0) %$% gene
age.pos
# save CD8 CD161 age associated genes with positive effect size
= age.res %>%
age.pos.cd161 filter(estimate > 0) %$%
genesaveRDS(age.pos.cd161, file = paste0(datapath, 'age.pos.cd161.rds'))
= data.frame(age.var = agelist[[9]][age.pos]) %>% rownames_to_column('gene')
age.var =
p ggplot(age.var, aes(y = reorder(gene, age.var) , x = age.var*100, label = gene )) +
+
mtheme theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
ylab('') +
xlab('% variance explained by age') +
xlim(c(-5, 40)) +
geom_point(color = "#E64B35FF") +
::geom_text_repel(data = age.var %>%
ggrepelfilter(age.var > 0.08) %>%
filter(gene %in% c('HLA-DQA1', 'KRT1', 'IFNG', 'CCL4', 'KLRD1',
"CCL5", "CFP", "CD74", "HLA-DRA", "IL2RB",
"IL17RA", "TNFRSF8", "MAP3K8", "SERPINB6", "CD38",
"TYK2", "CD70", "PROK2" , "RGS1" )),
size = 3,
force = 1,
nudge_x = -20,
direction = "y",
hjust = 1,
segment.size = 0.001) +
ggtitle('Positive association with age\nCD8 CD161+ T cells') +
theme(axis.title = element_text(size = 18))
ggsave(p,filename = paste0(figpath, 'cd161T.varexp.age.positive.pdf'), width = 4, height = 4)
# write lists
plot(age.coef)
= age.coef[age.coef > 0]
pos.age = age.coef[age.coef < 0]
neg.age ::fwrite(list(names(pos.age)),file = paste0(datapath,'pos.age.cd8n.txt'))
data.table::fwrite(list(names(neg.age)),file = paste0(datapath,'neg.age.cd8n.txt')) data.table
Mixed effects covariate adjusted model of vaccination effects across donors within each protein based subset – unadjuvanted cohort. Day 1 and day 7 post vaccination effects estimates. Derive the effect size of vaccination effect to rank genes and run enrichment.
mid_res/1_H1N1_pseudobulk_DE/1_h1_mixed_effect_workflow_V4.r
# H1N1 differential expression testing and gene set enrichment analysis
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(variancePartition))
suppressMessages(library(scglmmr))
source("functions/analysis_functions.R")
# make output directories
= here("mid_res/1_H1N1_pseudobulk_DE/dataV4/")
datapath dir.create(datapath, recursive = TRUE)
= here("mid_res/1_H1N1_pseudobulk_DE/figuresV4/")
figpath dir.create(figpath, recursive = TRUE)
# parallel options
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# read processed pseudobulk data
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb
# subset to unadjuvanted cohort and remove cell type string from sample names
= lapply(pb, function(x) x = x[ ,1:40])
pb = gsub("~.*","",colnames(pb[[1]]))
cnames = lapply(pb, function(x) {
pb %>%
x as.data.frame() %>%
setNames(nm = cnames) %>%
as.matrix()
})
# sample metadata
= readRDS(file = here('data/samplemd.rds')) %>% filter(! adjmfc.group %in% 'AS03')
samplemd $scaledage = as.vector(scale(samplemd$age))
samplemdnames(samplemd)[names(samplemd) == 'sampleid'] <- 'subjectid'
names(samplemd)[names(samplemd) == 'adjmfc.group'] <- 'group'
$group = str_replace_all(string = samplemd$group,pattern = ' ', replacement = '')
samplemd
# format
= samplemd %>%
samplemd mutate(time.group = paste(timepoint, group,sep = "_")) %>%
remove_rownames() %>%
column_to_rownames('sample')
# relevel combined factor
$time.group = factor(samplemd$time.group,
samplemdlevels = c('d0_high', 'd1_high', 'd7_high',
'd0_low', 'd1_low', 'd7_low'))
$timepoint = factor(samplemd$timepoint, levels = c('d0', 'd1', 'd7'))
samplemd
# create separate model metadata for the separate cohorts being tested
= samplemd[samplemd$time_cohort == 'd1', ] %>% droplevels()
d1 = samplemd[samplemd$time_cohort == 'd7', ] %>% droplevels()
d7 = samplemd[samplemd$timepoint == 'd0', ] %>% droplevels()
d0
# subset the bulk lists for each time cohort
= lapply(pb, function(x){ x = x[ , rownames(d1)]})
d1d = lapply(pb, function(x){ x = x[ , rownames(d7)]})
d7d = lapply(pb, function(x){ x = x[ , rownames(d0)]})
d0d
################################
# fit day 1 model
<- ~ 0 + timepoint + batch + gender + age + (1|subjectid)
f1
# set up contrast matrix (based on first element of list)
= edgeR::DGEList(counts = d1d[[1]], samples = d1)
d = getContrast(exprObj = d, formula = f1, data = d1, coefficient = c( 'timepointd1', 'timepointd0'))
cmat plotContrasts(cmat)
# run on each subset
= v1 = list()
fit1 for (i in 1:length(d1d)) {
# init data
= d1
meta = f1
form = cmat
contrast_matrix = d1d[[i]]
counts
# dge list
= edgeR::DGEList(counts = counts, samples = meta)
d
# filter cell type specific lowly expressed genes and calc norm factors
= edgeR::filterByExpr(y = d$counts, min.count = 3, design = as.factor(d$samples$timepoint))
gtable print(names(d1d)[i]);print(table(gtable))
= d[gtable, keep.lib.sizes=FALSE]
d = edgeR::calcNormFactors(object = d)
d
# get voom observation level weights
= voomWithDreamWeights(counts = d,
v formula = form,
data = meta,
BPPARAM = pparam,
plot = TRUE, save.plot = TRUE)
# fit contrast mixed model
= dream(exprObj = v, formula = form, data = meta,
fitmm L = contrast_matrix, useWeights = TRUE,
BPPARAM = pparam, REML = TRUE)
= variancePartition::eBayes(fit = fitmm)
fitmm # save results
= v
v1[[i]] = fitmm
fit1[[i]]
}names(v1) = names(fit1) = names(d1d)
################################
# fit day 7 model (uses same formula)
<- ~ 0 + timepoint + batch + gender + age + (1|subjectid)
f1
# set up contrast matrix (based on first element of list)
= edgeR::DGEList(counts = d7d[[1]], samples = d7)
d = getContrast(exprObj = d, formula = f1, data = d7, coefficient = c( 'timepointd7', 'timepointd0'))
cmat plotContrasts(cmat)
# run on each subset
= v7 = list()
fit7 for (i in 1:length(d7d)) {
# init data
= d7
meta = f1
form = cmat
contrast_matrix = d7d[[i]]
counts
# dge list
= edgeR::DGEList(counts = counts, samples = meta)
d
# filter cell type specific lowly expressed genes
= edgeR::filterByExpr(y = d$counts, min.count = 3, design = as.factor(d$samples$timepoint))
gtable table(gtable)
= d[gtable, keep.lib.sizes=FALSE]
d = edgeR::calcNormFactors(object = d)
d
# get voom observation level weights
= voomWithDreamWeights(counts = d,
v formula = form,
data = meta,
BPPARAM = pparam,
plot = TRUE, save.plot = TRUE)
# fit contrast mixed model
= dream(exprObj = v, formula = form, data = meta,
fitmm L = contrast_matrix, useWeights = TRUE,
BPPARAM = pparam, REML = TRUE)
= variancePartition::eBayes(fit = fitmm)
fitmm # save results
= v
v7[[i]] = fitmm
fit7[[i]]
}names(v7) = names(fit7) = names(d7d)
# run baseline model using limma
# set up fixed effects model to run with limma
<- model.matrix(~ 0 + group + batch + gender + age, data = d0)
mod0 colnames(mod0) = c("high", "low", 'batch2', "genderM", "age")
= makeContrasts(adjmfc = high - low, levels = colnames(mod0))
c0
= v0 = cont0 = list()
fit0 for (i in 1:length(d0d)) {
# init data
= d0
meta # form = f1
= c0
contrast_matrix = d0d[[i]]
counts
# dge list
= edgeR::DGEList(counts = counts, samples = meta)
d
# filter cell type specific lowly expressed genes ** Change grouping factor for filter by expression to group
= edgeR::filterByExpr(y = d$counts, min.count = 3, design = as.factor(d$samples$group))
gtable table(gtable)
= d[gtable, keep.lib.sizes=FALSE]
d = edgeR::calcNormFactors(object = d)
d
# get voom observation level weights
= voom(counts = d, design = mod0, save.plot = TRUE, plot = TRUE)
v #v = voomWithDreamWeights(counts = d, formula = form, data = meta, BPPARAM = pparam, plot = TRUE, save.plot = TRUE)
# fit contrast mixed model
= limma::lmFit(object = v,design = mod0)
fit = contrasts.fit(fit = fit, contrasts = c0)
cfit = limma::eBayes(fit = cfit)
eb # save results
= v
v0[[i]] = fit
fit0[[i]] = eb
cont0[[i]]
}names(v0) = names(fit0) = names(cont0) = names(d0d)
# save model fitting data
saveRDS(object = samplemd, file = paste0(datapath, 'samplemd.rds'))
saveRDS(object = d1, file = paste0(datapath, 'd1.rds'))
saveRDS(object = d7, file = paste0(datapath, 'd7.rds'))
saveRDS(object = d0, file = paste0(datapath, 'd0.rds'))
saveRDS(object = d1d, file = paste0(datapath, 'd1d.rds'))
saveRDS(object = d7d, file = paste0(datapath, 'd7d.rds'))
saveRDS(object = d0d, file = paste0(datapath, 'd0d.rds'))
# save model fits
# d0
saveRDS(object = fit0, file = paste0(datapath, 'fit0.rds'))
saveRDS(object = cont0, file = paste0(datapath, 'cont0.rds'))
saveRDS(object = v0, file = paste0(datapath, 'v0.rds'))
# day 1
saveRDS(object = fit1, file = paste0(datapath, 'fit1.rds'))
saveRDS(object = v1, file = paste0(datapath, 'v1.rds'))
# day 7
saveRDS(object = fit7, file = paste0(datapath, 'fit7.rds'))
saveRDS(object = v7, file = paste0(datapath, 'v7.rds'))
# sessioninfo
sessionInfo()
Enrichment of curated gene signature pathways based on genes ranked
by vaccination effects above.
mid_res/1_H1N1_pseudobulk_DE/2_rungsea_day1_day7_V4.r
# R version 4.0.5
# H1N1 differential expression testing and gene set enrichment analysis
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
# parallel options for FseaList
::register(BiocParallel::SnowParam(4))
BiocParallel= BiocParallel::SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# set data path
= here("mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/")
datapath dir.create(datapath)
# load pathways to be tested.
= readRDS(file = here('signature_curation/combined_sig_sub.rds'))
sig_test = readRDS(file = here('signature_curation/sig_test_sub.rds'))
core_sigs
# load each time statistical contrast model result extract contrast and rank genes by t statistic
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/fit1.rds'))
fit1 = readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/fit7.rds'))
fit7 = ExtractResult(model.fit.list = fit1, what = 'lmer.z.ranks', coefficient.number = 1, coef.name = 'L1')
r1 = ExtractResult(model.fit.list = fit7, what = 'lmer.z.ranks', coefficient.number = 1, coef.name = 'L1')
r7
# run gene set enrichment Day 1 models
# run unbiased modules and core signatures from past flu studies
= FgseaList(rank.list.celltype = r1, pathways = core_sigs, BPPARAM = pparam)
g1c = FgseaList(rank.list.celltype = r1, pathways = sig_test, BPPARAM = pparam)
g1f
# day 7
= FgseaList(rank.list.celltype = r7, pathways = sig_test, BPPARAM = pparam)
g7f
# save
saveRDS(object = g1c, file = paste0(datapath,'g1c.rds'))
saveRDS(object = g1f, file = paste0(datapath,'g1f.rds'))
saveRDS(object = g7f, file = paste0(datapath,'g7f.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] viridis_0.5.1 viridisLite_0.3.0 scglmmr_0.1.0 variancePartition_1.25.6
# [5] BiocParallel_1.24.1 limma_3.46.0 magrittr_2.0.1 here_1.0.1
# [9] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [13] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3
# [17] tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] tidyselect_1.1.0 lme4_1.1-26 RSQLite_2.2.7 AnnotationDbi_1.52.0
# [5] grid_4.0.5 scatterpie_0.1.7 munsell_0.5.0 codetools_0.2-18
# [9] statmod_1.4.35 withr_2.4.3 colorspace_2.0-0 GOSemSim_2.16.1
# [13] Biobase_2.50.0 rstudioapi_0.13 stats4_4.0.5 ggsignif_0.6.0
# [17] DOSE_3.16.0 labeling_0.4.2 MatrixGenerics_1.2.1 Rdpack_2.1.1
# [21] emmeans_1.5.4 GenomeInfoDbData_1.2.4 polyclip_1.10-0 pheatmap_1.0.12
# [25] bit64_4.0.5 farver_2.0.3 rprojroot_2.0.2 downloader_0.4
# [29] coda_0.19-4 vctrs_0.4.1 generics_0.1.2 TH.data_1.0-10
# [33] R6_2.5.0 doParallel_1.0.16 GenomeInfoDb_1.26.7 graphlayouts_0.7.2
# [37] locfit_1.5-9.4 bitops_1.0-6 cachem_1.0.4 fgsea_1.16.0
# [41] DelayedArray_0.16.3 assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16
# [45] ggraph_2.0.5 enrichplot_1.10.2 gtable_0.3.0 egg_0.4.5
# [49] tidygraph_1.2.0 sandwich_3.0-0 rlang_1.0.2 slanter_0.2-0
# [53] splines_4.0.5 rstatix_0.7.0 broom_0.7.5 abind_1.4-5
# [57] BiocManager_1.30.10 reshape2_1.4.4 modelr_0.1.8 backports_1.2.1
# [61] qvalue_2.22.0 clusterProfiler_3.18.1 tools_4.0.5 ellipsis_0.3.2
# [65] gplots_3.1.1 RColorBrewer_1.1-2 BiocGenerics_0.36.1 Rcpp_1.0.6
# [69] plyr_1.8.6 progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3
# [73] prettyunits_1.1.1 ggpubr_0.4.0 cowplot_1.1.1 S4Vectors_0.28.1
# [77] zoo_1.8-8 SummarizedExperiment_1.20.0 haven_2.3.1 ggrepel_0.9.1
# [81] fs_1.5.0 data.table_1.14.0 DO.db_2.9 openxlsx_4.2.3
# [85] reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0 matrixStats_0.58.0
# [89] hms_1.0.0 GSVA_1.38.2 xtable_1.8-4 pbkrtest_0.5-0.1
# [93] RhpcBLASctl_0.21-247.1 XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
# [97] IRanges_2.24.1 gridExtra_2.3 compiler_4.0.5 KernSmooth_2.23-18
# [101] crayon_1.4.1 shadowtext_0.0.9 minqa_1.2.4 ggfun_0.0.4
# [105] snow_0.4-3 lubridate_1.7.9.2 DBI_1.1.1 tweenr_1.0.2
# [109] dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27 Matrix_1.3-2
# [113] car_3.0-10 cli_3.3.0 rbibutils_2.0 parallel_4.0.5
# [117] igraph_1.2.6 GenomicRanges_1.42.0 pkgconfig_2.0.3 rvcheck_0.1.8
# [121] foreign_0.8-81 xml2_1.3.2 foreach_1.5.1 annotate_1.68.0
# [125] XVector_0.30.0 GeneOverlap_1.26.0 estimability_1.3 rvest_0.3.6
# [129] digest_0.6.27 graph_1.68.0 cellranger_1.1.0 fastmatch_1.1-0
# [133] edgeR_3.32.1 GSEABase_1.52.1 curl_4.3 gtools_3.8.2
# [137] nloptr_1.2.2.2 lifecycle_1.0.0 nlme_3.1-152 jsonlite_1.7.2
# [141] aod_1.3.1 carData_3.0-4 pillar_1.4.7 lattice_0.20-41
# [145] fastmap_1.1.0 httr_1.4.2 survival_3.2-10 GO.db_3.12.1
# [149] glue_1.6.2 zip_2.1.1 iterators_1.0.13 bit_4.0.4
# [153] ggforce_0.3.3 stringi_1.5.3 blob_1.2.1 org.Hs.eg.db_3.12.0
# [157] caTools_1.18.1 memoise_2.0.0
Generate figures from model results above for day 1 vaccination
effects in unadjuvanted cohort. Derive dell type specific vaccination
genes and shared core interferon signature.
mid_res/1_H1N1_pseudobulk_DE/3_V4_figures.r
# R version 4.0.5
# H1N1 differential expression testing and gene set enrichment analysis
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
# set fig path
= here("mid_res/1_H1N1_pseudobulk_DE/figuresV4/")
figpath = here("mid_res/1_H1N1_pseudobulk_DE/dataV4/")
datapath
# heirarchical signal visualization
# day 1 gsea result
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
g1c = PlotFgsea(gsea_result_list = g1c, p.threshold = 0.05)
p #new
= data.table::fread(file = here('signature_curation/sig_test_sub_annotation.txt'))
mann = unique(mann$annotation)
categ = p$data
pd $annotation = plyr::mapvalues(pd$pathway, from = mann$pathway, to = mann$annotation)
pd$annotation = factor(pd$annotation,levels = categ)
pd
# add annotation to plot data env
$data = pd
p$data$celltype = str_replace_all(p$data$celltype , pattern = '_', replacement = ' ')
p$data = p$data %>% filter(!pathway == 'btm M4.1 cell cycle (I)')
p
$data
p# save plot
= ggsci::pal_jama()(2)[2]
high.col = ggsci::pal_jama()(2)[1]
low.col = p +
p facet_grid(vars(annotation), scales = 'free', space = 'free', switch = 'y', ) +
theme(strip.text.y = element_text(angle = 0)) +
theme(strip.placement = "outside") +
scale_size_area(max_size = 4) +
theme(legend.position = 'right', legend.justification = 'bottom') +
theme(strip.background = element_blank()) +
scale_fill_gradient2(low = low.col, mid = 'white', high = high.col) +
theme(legend.position = 'bottom')
pggsave(p,filename = paste0(figpath, 'g1c.gsea.pdf'),width = 6.3, height = 6.2)
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
g1c = PlotFgsea(gsea_result_list = g1c, padj_filter = 0.05)
p #new
= data.table::fread(file = here('signature_curation/sig_test_sub_annotation.txt'))
mann = unique(mann$annotation)
categ = p$data
pd $annotation = plyr::mapvalues(pd$pathway, from = mann$pathway, to = mann$annotation)
pd$annotation = factor(pd$annotation,levels = categ)
pd
# add annotation to plot data env
$data = pd
p$data$celltype = str_replace_all(p$data$celltype , pattern = '_', replacement = ' ')
p$data = p$data %>% filter(!pathway == 'btm M4.1 cell cycle (I)')
p
p$data
p# save plot
= ggsci::pal_jama()(2)[2]
high.col = ggsci::pal_jama()(2)[1]
low.col quantile(-log10(p$data$padj))
= seq(1.3, 3, 5, 7, 8) # round quantiles
break.circle = p +
p facet_grid(vars(annotation), scales = 'free', space = 'free', switch = 'y') +
theme(strip.text.y = element_text(angle = 0)) +
theme(strip.placement = "outside") +
scale_size_area(breaks = break.circle) +
theme(legend.position = 'bottom', legend.margin = margin(t = -8)) +
theme(strip.background = element_blank()) +
scale_fill_gradient2(low = low.col, mid = 'white', high = high.col) +
theme(legend.spacing = unit(0.1, 'cm')) +
theme(axis.text.x = element_text(size = 10), axis.text.y = element_text(size = 10))
pggsave(p,filename = paste0(figpath, 'g1c.gsea.pdf'),width = 7.5, height = 6.6)
#######
# day 1 heatmap fc
# highlight genes that have a relative signal enriched in certain cell types
# define core day 1 induced signature
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/fit1.rds'))
fit1 = ExtractResult(model.fit.list = fit1, coefficient.number = 1, coef.name = 'L1')
d1res
# get leading edge genes of all enrichments by cell type
= LeadingEdgeIndexed(gsea.result.list = g1c,padj.threshold = 0.05)
li = unique(unlist(li, use.names = FALSE))
li.full
# logFold Change estimates from mixed model
= GetGeneMatrix(result.list = d1res,
gm gene_subset = li.full,
pvalfilter = 0.05,
stat_for_matrix = 'logFC',
logfcfilter = 0.1)
# get rid of underscore in celltype name
colnames(gm) = str_replace_all(colnames(gm), pattern = '_', replacement = ' ')
# function to define number of columns that have a non zero value
<- apply(gm, 1, function(x) sum(! x == 0 ))
nnzero
# Define shared induced state
= gm[which(nnzero >= 5),]
ms1 = rownames(ms1)
ms1.names
# save genes defining the shared state
saveRDS(ms1.names, file = paste0(datapath, 'ms1.names.rds'))
# heatmap of core shared day 1 interferon state
pdf(file = paste0(figpath, 'ms1.mat.pdf'),width = 5, height = 5)
::pheatmap(ms1, treeheight_row = 10, treeheight_col = 10,
pheatmapcolor = viridis::viridis(n = 10, option = "inferno"),clustering_method = 'ward.D',
breaks = seq(from = 0, to = 1.5, length.out = 10),
fontsize_row = 6, fontsize_col = 12, border_color = NA)
dev.off()
# celltype specific signals
= c(
asub # bc mem and naive
'TRIM56','MYD88', 'TBK1', 'CD69',
# CD14 mono
'CAMK2D','IFI27' ,'IL15RA' ,'IL1RN' ,'INSIG1','LCK' ,
'LYN','SERPING1','SLC16A6' ,'SLC25A1','TGFB1',
'CCL2' ,'FCGR1B',
# mdc cd16 cd14
'WARS' ,'PARP14', 'IFITM2' ,'FBXO6' ,'PSMA4' ,'ACTR2', 'ICAM1', 'IL15',
#CD16 mono
'LAP3' ,'CYBA' ,'FYB' ,'FCGR1A',
# CD4 naive
'IRF4' ,'IL2RB' ,'IRF8' ,'MXD1' , 'RELA' ,
# mDC
'TYK2' ,'TNFSF10',
# CD4 EM
'IRS2','PTPN2',
# CD8 naive t
'EIF2AK2', 'EIF4G2' ,'PIK3CG',
# nk
'IFI44' ,
# CD8 CD161
'MX2','PIK3R5' , 'CALR'
)
library(ComplexHeatmap)
= gm[! rownames(gm) %in% c(ms1.names), ]
gmd<0] <- 0
gmd[gmd= slanter::slanted_reorder(gmd)
gmd
# select labes
= which(rownames(gmd) %in% asub)
rlab = rowAnnotation(foo = anno_mark(
ha at = rlab,
labels = asub,
labels_gp = gpar(color = "black", fontsize = 8)
))
# rlab2 = which(rownames(gmd) %in% asub2)
# ha2 = rowAnnotation(foo = anno_mark(
# at = rlab2,
# labels = asub2,
# labels_gp = gpar(color = "red", fontsize = 8)
# ))
# color map
= circlize::colorRamp2(
col_fun breaks = seq(0,1.2, length.out = 10),
colors = viridis::viridis(n = 10, option = 'inferno')
)
# draw heatmap and rasterize
pdf(file = paste0(figpath, 'gmd.mat.pdf'),width = 5, height = 7.5)
::Heatmap(matrix = gmd,
ComplexHeatmapright_annotation = ha,
show_row_names = FALSE,
# do not cluster use slanter
cluster_columns = FALSE, cluster_rows = FALSE,
col = col_fun,
use_raster = TRUE)
dev.off()
# heirarchical signal deconvolution of reactome interferon signature
# g1c object loaded in first line above
# g1c = readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
# get all leading edge genes across subsets
= g1c %>%
rifn bind_rows() %>%
filter( pathway == 'reactome interferon signaling' & padj < 0.05) %$% leadingEdge %>%
unlist() %>%
unique()
# extract matrix of log fold change estimate across donors
= GetGeneMatrix(result.list = d1res,
mtx stat_for_matrix = "logFC",
gene_subset = rifn,
pvalfilter = 0.05,
logfcfilter = 0.25)
# test = GetGeneMatrix(result.list = d1res,
# stat_for_matrix = "logFC",
# gene_subset = rifn, pvalfilter = Inf, logfcfilter = -5)
# remove underscores
colnames(mtx) = str_replace_all(colnames(mtx),pattern = '_', replacement = " ")
# draw heatmap
::pheatmap(
pheatmap
mtx ,border_color = NA,
color = viridis::viridis(n = 18, option = "A"),
breaks = seq(from = 0, to = 1.5, length.out = 19),
treeheight_col = 10, treeheight_row = 0,
fontsize = 6, fontsize_row = 5, fontsize_col = 6,
width = 2.3, height = 5,
clustering_method = 'complete',
filename = paste0(figpath, "d1_reactomeLeadingedge_cluster_genes_heatmap_full.pdf")
)
# now show the main perturbed cell type (mono 14) fold changes in pseudobulk logcpm
# across each individual donor
# load model fitting data and pseudobulk data saved in mixed model workflow (scipt 1)
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/d1d.rds'))
d1d = readRDS(file = here("mid_res/1_H1N1_pseudobulk_DE/dataV4/d1.rds"))
d1
# log CPM the pseudobulk data
= lapply(d1d, edgeR::cpm, log = TRUE)
lcpm
# scglmmr function to extract leading edge genes
= LeadEdgeTidySampleExprs(av.exprs.list = lcpm, gsea.list = g1c, padj.filter = 0.05, NES.filter = 0)
lexp1
# annotate time and batch on the heatmap
= d1[, c('batch', 'timepoint')]
heatmap_anno = list(
anno_color timepoint = c('d1' = "orange", "d0" = "white"),
batch = c('1' = "black", '2' = "white")
)
# define color vector
= c("#053061", "#1E61A5", "#3C8ABE", "#7CB7D6", "#BAD9E9", "#E5EEF3",
cu "#F9EAE1", "#F9C7AD", "#EB9273", "#CF5246", "#AB1529", "#67001F")
# scglmmr function for leading edge gene matrix across donors
= scglmmr::LeadEdgeSampleHeatmap(tidy.exprs.list = lexp1,
mat2 modulename = "reactome interferon signaling",
celltype_plot = 'CD14_Mono',
metadata = meta,
metadata_annotate = c('adjMFC', 'batch'),
sample_column = 'sample',
returnmat = TRUE)
# draw heatmap
::pheatmap(mat2,
pheatmapborder_color = NA,
treeheight_row = 0, treeheight_col = 10,
annotation = heatmap_anno,
annotation_colors = anno_color,
color = cu,
width = 5, height = 7.6,
scale = "row",
filename = paste0(figpath, "mono_d1_ReactomeIFN.pdf")
)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
#
# attached base packages:
# [1] grid stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] ComplexHeatmap_2.6.2 scglmmr_0.1.0 here_1.0.1 forcats_0.5.1 stringr_1.4.0
# [6] dplyr_1.0.4 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
# [11] ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] tidyselect_1.1.0 lme4_1.1-26 RSQLite_2.2.7
# [4] AnnotationDbi_1.52.0 BiocParallel_1.24.1 scatterpie_0.1.7
# [7] munsell_0.5.0 codetools_0.2-18 statmod_1.4.35
# [10] withr_2.4.3 colorspace_2.0-0 GOSemSim_2.16.1
# [13] Biobase_2.50.0 rstudioapi_0.13 stats4_4.0.5
# [16] ggsignif_0.6.0 DOSE_3.16.0 labeling_0.4.2
# [19] Rdpack_2.1.1 MatrixGenerics_1.2.1 emmeans_1.5.4
# [22] GenomeInfoDbData_1.2.4 polyclip_1.10-0 bit64_4.0.5
# [25] farver_2.0.3 pheatmap_1.0.12 rprojroot_2.0.2
# [28] downloader_0.4 coda_0.19-4 vctrs_0.4.1
# [31] generics_0.1.2 TH.data_1.0-10 doParallel_1.0.16
# [34] R6_2.5.0 GenomeInfoDb_1.26.7 clue_0.3-59
# [37] graphlayouts_0.7.2 locfit_1.5-9.4 bitops_1.0-6
# [40] cachem_1.0.4 fgsea_1.16.0 DelayedArray_0.16.3
# [43] assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16
# [46] ggraph_2.0.5 enrichplot_1.10.2 gtable_0.3.0
# [49] Cairo_1.5-12.2 egg_0.4.5 tidygraph_1.2.0
# [52] sandwich_3.0-0 rlang_1.0.2 slanter_0.2-0
# [55] GlobalOptions_0.1.2 splines_4.0.5 rstatix_0.7.0
# [58] broom_0.7.5 BiocManager_1.30.10 reshape2_1.4.4
# [61] abind_1.4-5 modelr_0.1.8 backports_1.2.1
# [64] qvalue_2.22.0 clusterProfiler_3.18.1 tools_4.0.5
# [67] ellipsis_0.3.2 gplots_3.1.1 RColorBrewer_1.1-2
# [70] BiocGenerics_0.36.1 Rcpp_1.0.6 plyr_1.8.6
# [73] progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3
# [76] prettyunits_1.1.1 ggpubr_0.4.0 GetoptLong_1.0.5
# [79] viridis_0.5.1 cowplot_1.1.1 S4Vectors_0.28.1
# [82] zoo_1.8-8 SummarizedExperiment_1.20.0 haven_2.3.1
# [85] ggrepel_0.9.1 cluster_2.1.2 fs_1.5.0
# [88] variancePartition_1.25.6 magrittr_2.0.1 data.table_1.14.0
# [91] DO.db_2.9 openxlsx_4.2.3 circlize_0.4.12
# [94] reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0
# [97] matrixStats_0.58.0 hms_1.0.0 GSVA_1.38.2
# [100] xtable_1.8-4 pbkrtest_0.5-0.1 RhpcBLASctl_0.21-247.1
# [103] XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
# [106] IRanges_2.24.1 gridExtra_2.3 shape_1.4.6
# [109] compiler_4.0.5 KernSmooth_2.23-18 crayon_1.4.1
# [112] shadowtext_0.0.9 minqa_1.2.4 ggfun_0.0.4
# [115] lubridate_1.7.9.2 DBI_1.1.1 tweenr_1.0.2
# [118] dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27
# [121] Matrix_1.3-2 car_3.0-10 cli_3.3.0
# [124] rbibutils_2.0 parallel_4.0.5 igraph_1.2.6
# [127] GenomicRanges_1.42.0 pkgconfig_2.0.3 rvcheck_0.1.8
# [130] foreign_0.8-81 foreach_1.5.1 xml2_1.3.2
# [133] annotate_1.68.0 XVector_0.30.0 GeneOverlap_1.26.0
# [136] estimability_1.3 rvest_0.3.6 digest_0.6.27
# [139] graph_1.68.0 cellranger_1.1.0 fastmatch_1.1-0
# [142] edgeR_3.32.1 GSEABase_1.52.1 curl_4.3
# [145] gtools_3.8.2 nloptr_1.2.2.2 rjson_0.2.20
# [148] nlme_3.1-152 lifecycle_1.0.0 jsonlite_1.7.2
# [151] aod_1.3.1 carData_3.0-4 viridisLite_0.3.0
# [154] limma_3.46.0 pillar_1.4.7 lattice_0.20-41
# [157] fastmap_1.1.0 httr_1.4.2 survival_3.2-10
# [160] GO.db_3.12.1 glue_1.6.2 zip_2.1.1
# [163] iterators_1.0.13 png_0.1-7 bit_4.0.4
# [166] ggforce_0.3.3 stringi_1.5.3 blob_1.2.1
# [169] org.Hs.eg.db_3.12.0 caTools_1.18.1 memoise_2.0.0
Generate figures from model results above for day 7 vaccination
effects in unadjuvanted cohort.
mid_res/1_H1N1_pseudobulk_DE/4_V4_figures_d7.R
# R version 4.0.5
# H1N1 differential expression testing and gene set enrichment analysis
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
# set fig path
= here("mid_res/1_H1N1_pseudobulk_DE/figuresV4/")
figpath
# set theme for subset plots
= list(
mtheme1 theme_bw(base_size = 10.5),
theme(text = element_text(color = 'black')),
theme(strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold",family = "Helvetica"),
axis.text.y = element_text(size = 12, color = 'black'))
)
# load day 7 mixed model gene set enrichment results
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g7f.rds'))
g7f
# filter subset of signals to visualize
= lapply(g7f, function(x)
g7f %>%
x filter(!str_sub(pathway, 1,5) == 'REACT' ) %>%
filter(NES > 0)
)
# save global
= PlotFgsea(gsea_result_list = g7f, p.threshold = 0.01, NES_filter = 0.1)
p ggsave(p, filename = paste0(figpath, 'gsea.pos.d7.pdf'), width = 8, height = 9)
# cell type specific signals to highlight.
= p$data
d7d $pathway = as.character(d7d$pathway)
d7d# define B cell signals
= c(
bsub 'LI.S2 B cell surface signature',
'LI.M47.0 enriched in B cells (I)',
'CD40_ACT',
'LI.M5.0 regulation of antigen presentation and immune response',
'LI.S8 Naive B cell surface signature',
'KEGG_AMINOACYL_TRNA_BIOSYNTHESIS',
'KEGG_OXIDATIVE_PHOSPHORYLATION',
'LI.M212 purine nucleotide biosynthesis',
'LI.M234 transcription elongation, RNA polymerase II',
'LI.M32.0 platelet activation (I)',
'LI.M227 translation initiation',
'LI.M37.0 immune activation - generic cluster'
)
= d7d %>% filter(celltype == 'BC_Naive' & pathway %in% bsub)
bsub.plot
# give shorter names
$pathway[bsub.plot$pathway == 'LI.M5.0 regulation of antigen presentation and immune response'] <-
bsub.plot'LI.M5.0 regulation of antigen presentation'
$pathway[bsub.plot$pathway == 'KEGG_AMINOACYL_TRNA_BIOSYNTHESIS'] <-
bsub.plot'kegg aminoacyl tRNA biosynthesis'
$pathway[bsub.plot$pathway == 'KEGG_OXIDATIVE_PHOSPHORYLATION'] <-
bsub.plot'kegg oxidatie phosphorylation'
# save plot
= ggplot(bsub.plot, aes(x = NES, y = reorder(pathway, NES), size = -log10(padj)) ) +
p +
mtheme1 ylab("") +
xlab('Normalized Enrichment Score') +
geom_vline(xintercept = 0, linetype = 'dashed') +
xlim(-1,3) +
geom_point(shape = 21 , fill ='red' ) +
scale_size_area() +
ggtitle('Day 7 induced: Naive B cells')
ggsave(p,filename = paste0(figpath,'bnaive.d7.pdf'), width = 7, height = 3)
# t cell (EM )
= d7d %>% filter(celltype == 'CD4_Efct_Mem_Tcell')
tsub.plot # give shorter names
$pathway = as.character(tsub.plot$pathway)
tsub.plot$pathway[tsub.plot$pathway == 'KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_DEGRADATION'] <-
tsub.plot'kegg valine leucine isoleucine degratation'
$pathway[tsub.plot$pathway == 'KEGG_PEROXISOME'] <- 'kegg peroxisome'
tsub.plot$pathway[tsub.plot$pathway == 'KEGG_FATTY_ACID_METABOLISM'] <-
tsub.plot'kegg fatty acid metabolism'
$pathway[tsub.plot$pathway == 'KEGG_PRIMARY_IMMUNODEFICIENCY'] <-
tsub.plot'kegg primary immunodeficiency'
# save plot
= ggplot(tsub.plot, aes(x = NES, y = reorder(pathway, NES), size = -log10(padj)) ) +
p +
mtheme1 ylab("") +
xlab('Normalized Enrichment Score') +
geom_vline(xintercept = 0, linetype = 'dashed') +
xlim(-1,3) +
geom_point(shape = 21 , fill ='red' ) +
scale_size_area() +
ggtitle('Day 7 induced: CD4 effector memory T cells')
pggsave(p,filename = paste0(figpath,'cd4mem.d7.pdf'), width = 7, height = 3)
Visualize core shared across subsets interferon signature defined in
script 3.
mid_res/1_H1N1_pseudobulk_DE/5_shared_core_fin_state.r
# R version 4.0.5
# H1N1 differential expression testing and gene set enrichment analysis
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
source('functions/MattPMutils.r')
# set fig path
= here("mid_res/1_H1N1_pseudobulk_DE/figuresV4/")
figpath
# load model fitting data and pseudobulk data saved in mixed model workflow (scipt 1)
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/d1d.rds'))
d1d
# log CPM the pseudobulk data
= lapply(d1d, edgeR::cpm, log = TRUE)
lcpm
# read Core signature
= readRDS(file = here("mid_res/1_H1N1_pseudobulk_DE/dataV4/ms1.names.rds"))
ms1.names
= lapply(lcpm, function(x){ x[ms1.names, ] %>%
av2 t() %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
mutate(timepoint = str_sub(sample, -2, -1))
})
# for (i in 1:length(av2)) {
# av2[[i]]$celltype = names(av2)[i]
# }
= av2 %>% bind_rows(.id = 'celltype')
av2 = av2 %>% gather(gene, average, ADAR:ZBP1)
av3 = av3 %>% group_by(sample, celltype, timepoint) %>%
av4 summarize(core_ifn_score = mean(average))
= av4 %>% mutate(subject = str_sub(sample, 1,3))
av4 $celltype = str_replace_all(string = av4$celltype,pattern = '_', replacement = ' ')
av4
# colors
= unname(sapply(c("grey", "orange"), col.alpha, 0.5))
cu1 = c("grey", "orange")
cu2 =
pggplot(av4, aes(x = celltype, y = core_ifn_score , color = timepoint, fill = timepoint )) +
theme_bw() +
geom_boxplot() +
#scale_x_discrete(position = "top") +
ylab("shared day 1 induced IFN state") +
xlab("")+
scale_color_manual(values = cu2 ) +
scale_fill_manual(values = cu1 ) +
theme(axis.title.y = element_text(size = 12, color = 'black' )) +
theme(axis.text.x = element_text(size = 11, color = "black") ) +
theme(axis.text.x=element_text(angle = 45, hjust = 1, color = 'black'))+
theme(legend.position = 'top')
pggsave(p, filename = paste0(figpath, "core_interferon_state.pdf"), width = 6, height = 5)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
#
# attached base packages:
# [1] grid stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] ComplexHeatmap_2.6.2 scglmmr_0.1.0 here_1.0.1 forcats_0.5.1 stringr_1.4.0
# [6] dplyr_1.0.4 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
# [11] ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] tidyselect_1.1.0 lme4_1.1-26 RSQLite_2.2.7
# [4] AnnotationDbi_1.52.0 BiocParallel_1.24.1 scatterpie_0.1.7
# [7] munsell_0.5.0 codetools_0.2-18 statmod_1.4.35
# [10] withr_2.4.3 colorspace_2.0-0 GOSemSim_2.16.1
# [13] Biobase_2.50.0 rstudioapi_0.13 stats4_4.0.5
# [16] ggsignif_0.6.0 DOSE_3.16.0 labeling_0.4.2
# [19] Rdpack_2.1.1 MatrixGenerics_1.2.1 emmeans_1.5.4
# [22] GenomeInfoDbData_1.2.4 polyclip_1.10-0 bit64_4.0.5
# [25] farver_2.0.3 pheatmap_1.0.12 rprojroot_2.0.2
# [28] downloader_0.4 coda_0.19-4 vctrs_0.4.1
# [31] generics_0.1.2 TH.data_1.0-10 doParallel_1.0.16
# [34] R6_2.5.0 GenomeInfoDb_1.26.7 clue_0.3-59
# [37] graphlayouts_0.7.2 locfit_1.5-9.4 bitops_1.0-6
# [40] cachem_1.0.4 fgsea_1.16.0 DelayedArray_0.16.3
# [43] assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16
# [46] ggraph_2.0.5 enrichplot_1.10.2 gtable_0.3.0
# [49] Cairo_1.5-12.2 egg_0.4.5 tidygraph_1.2.0
# [52] sandwich_3.0-0 rlang_1.0.2 slanter_0.2-0
# [55] GlobalOptions_0.1.2 splines_4.0.5 rstatix_0.7.0
# [58] broom_0.7.5 BiocManager_1.30.10 reshape2_1.4.4
# [61] abind_1.4-5 modelr_0.1.8 backports_1.2.1
# [64] qvalue_2.22.0 clusterProfiler_3.18.1 tools_4.0.5
# [67] ellipsis_0.3.2 gplots_3.1.1 RColorBrewer_1.1-2
# [70] BiocGenerics_0.36.1 Rcpp_1.0.6 plyr_1.8.6
# [73] progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3
# [76] prettyunits_1.1.1 ggpubr_0.4.0 GetoptLong_1.0.5
# [79] viridis_0.5.1 cowplot_1.1.1 S4Vectors_0.28.1
# [82] zoo_1.8-8 SummarizedExperiment_1.20.0 haven_2.3.1
# [85] ggrepel_0.9.1 cluster_2.1.2 fs_1.5.0
# [88] variancePartition_1.25.6 magrittr_2.0.1 data.table_1.14.0
# [91] DO.db_2.9 openxlsx_4.2.3 circlize_0.4.12
# [94] reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0
# [97] matrixStats_0.58.0 hms_1.0.0 GSVA_1.38.2
# [100] xtable_1.8-4 pbkrtest_0.5-0.1 RhpcBLASctl_0.21-247.1
# [103] XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
# [106] IRanges_2.24.1 gridExtra_2.3 shape_1.4.6
# [109] compiler_4.0.5 KernSmooth_2.23-18 crayon_1.4.1
# [112] shadowtext_0.0.9 minqa_1.2.4 ggfun_0.0.4
# [115] lubridate_1.7.9.2 DBI_1.1.1 tweenr_1.0.2
# [118] dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27
# [121] Matrix_1.3-2 car_3.0-10 cli_3.3.0
# [124] rbibutils_2.0 parallel_4.0.5 igraph_1.2.6
# [127] GenomicRanges_1.42.0 pkgconfig_2.0.3 rvcheck_0.1.8
# [130] foreign_0.8-81 foreach_1.5.1 xml2_1.3.2
# [133] annotate_1.68.0 XVector_0.30.0 GeneOverlap_1.26.0
# [136] estimability_1.3 rvest_0.3.6 digest_0.6.27
# [139] graph_1.68.0 cellranger_1.1.0 fastmatch_1.1-0
# [142] edgeR_3.32.1 GSEABase_1.52.1 curl_4.3
# [145] gtools_3.8.2 nloptr_1.2.2.2 rjson_0.2.20
# [148] nlme_3.1-152 lifecycle_1.0.0 jsonlite_1.7.2
# [151] aod_1.3.1 carData_3.0-4 viridisLite_0.3.0
# [154] limma_3.46.0 pillar_1.4.7 lattice_0.20-41
# [157] fastmap_1.1.0 httr_1.4.2 survival_3.2-10
# [160] GO.db_3.12.1 glue_1.6.2 zip_2.1.1
# [163] iterators_1.0.13 png_0.1-7 bit_4.0.4
# [166] ggforce_0.3.3 stringi_1.5.3 blob_1.2.1
# [169] org.Hs.eg.db_3.12.0 caTools_1.18.1 memoise_2.0.0
Comparison of day 7 signatures predictive of antibody response in
microarray and aggregated CITE-seq data
mid_res/array_bulk_comparison/array_bulk_comparison.R
# R 4.0.5
suppressMessages(library(tidyverse))
suppressMessages(library(scglmmr))
suppressMessages(library(here))
suppressMessages(library(BiocParallel))
suppressMessages(library(edgeR))
suppressMessages(library(variancePartition))
# set paths
= here('mid_res/array_bulk_comparison/figures/')
figpath dir.create(figpath)
= here('mid_res/array_bulk_comparison/generated_data/')
datapath dir.create(datapath)
# set parallel options
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
#pb = readRDS(file = here('mid_res/pb.ds'))
#fit7 = readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/fit7.rds'))
= readRDS(file = here('signature_curation/core_d7.rds'))
core7 $`LI.M156.0 plasma cell b cell Ig`
core7
# read processed pseudobulk data
# subset to unadjuvanted cohort and remove cell type string from sample names
= readRDS(here('data/h1h5_annotated_with_meta.rds'))
s
# day1.cohort = c("200", "205","207", "236", "237", "250", "273" ,"279")
= c("201", "209", "212", "215", "229", "233",
day7.cohort "234", "245", "256", "261", "268", "277")
= s@meta.data %>%
md7 filter(cohort == 'H1N1') %>%
filter(sampleid %in% day7.cohort) %>%
arrange(sampleid, timepoint)
= s@raw.data[ ,rownames(md7)]
umi7
# pseudobulk all cells
= lapply(X = split(md7, f = md7$sample), FUN = rownames)
scell = lapply(scell, function(x) Matrix::rowSums(umi7[ ,x]))
csample = as.data.frame(t(do.call(cbind, csample))) %>% t()
pbmat
#define day 7 metadata
= md7 %>%
met select(sample,timepoint , subjectid = sampleid) %>%
group_by(sample, subjectid, timepoint) %>%
distinct() %>%
ungroup() %>%
column_to_rownames('sample')
$timepoint = factor(met$timepoint, levels = c('d0', 'd7'))
met
# check order
stopifnot(isTRUE(all.equal(colnames(pbmat), rownames(met))))
# filter features
= filterByExpr(y = pbmat, design = met$timepoint,min.count = 3)
gene.keep = pbmat[gene.keep, ]
pbmat
# fit model
= ~ 0 + timepoint + (1|subjectid)
f1 = makeContrastsDream(formula = f1, data = met,
L1 contrasts = "timepointd7 - timepointd0")
= voomWithDreamWeights(counts = pbmat,formula = f1,
v7 BPPARAM = pparam,data = met)
= dream(exprObj = v7,formula = f1,data = met,
result7 L = L1, BPPARAM = pparam, useWeights = TRUE)
# save
saveRDS(result7,file = paste0(datapath,'result7.rds'))
## Part II fit same model on microarray data
# array data coefficient
# read array data
= data.table::fread("data/CHI_H1N1_data/microarray/CHI_GE_matrix_gene.txt", data.table = F) %>%
array ::remove_rownames() %>%
tibble::column_to_rownames("gene") %>%
tibbleselect(-matches("day70")) %>%
select(., matches("day0|day7")) %>%
select(-matches("pre"))
# day 7 samples; no day 7 data for subject 209
=
array7 %>%
array select(which(substr(names(.),1,3) %in% day7.cohort)) %>%
select(-matches("209"))
# Metadata
=
d7md colnames(array7) %>%
::as.data.frame() %>%
baserename(sample = ".") %>%
mutate(timepoint = str_sub(sample, -4,-1)) %>%
mutate(subjectid = str_sub(sample, 1,3)) %>%
column_to_rownames("sample")
$timepoint = factor(d7md$timepoint, levels = c('day0', 'day7'))
d7md
# check order
stopifnot(isTRUE(all.equal(colnames(array7), rownames(d7md))))
# test same genes
= rownames(pbmat)
gene.sub = array7[gene.sub, ]
array7 = !is.na(Matrix::rowSums(array7))
gene.keep2 = array7[gene.keep2, ]
array7
# fit model
.1 = makeContrastsDream(formula = f1, data = d7md,
L1contrasts = "timepointday7 - timepointday0")
# no weights for normalized microarray data ; same formula
.1 = dream(exprObj = array7, formula = f1,data = d7md,
result7L = L1.1, BPPARAM = pparam, useWeights = FALSE)
# save
saveRDS(result7.1,file = paste0(datapath,'result7.1.rds'))
# comparison
# extract results for array data
= ExtractResult(model.fit.list = list('array' = result7.1),
ra coefficient.number = 1,
coef.name = 'timepointday7 - timepointday0')
$array$logFC.array = ra$array$logFC
ra= ra$array %>%
ra select(gene, logFC.array)
# extract results for CITE-seq bulk data
= ExtractResult(model.fit.list = list('CITE-seq Bulk' = result7),
rc coefficient.number = 1,
coef.name = 'timepointd7 - timepointd0')
$`CITE-seq Bulk`$logFC.CITEseq = rc$`CITE-seq Bulk`$logFC
rc= rc$`CITE-seq Bulk` %>%
rc select(gene, logFC.CITEseq)
# visualize correlation between signals
= full_join(ra, rc)
d = d %>% filter(gene %in% core7$`LI.M156.0 plasma cell b cell Ig`)
dsub =
p ggplot(dsub, aes(x = logFC.array , y = logFC.CITEseq)) +
theme_bw() +
geom_smooth(method = 'lm', color = 'black') +
geom_point(shape = 21, color = 'white', fill = 'black') +
::geom_text_repel(data = dsub, mapping = aes(label = gene),
ggrepelsegment.size = 0.1, box.padding = 0.1,
max.overlaps = 4, size = 2.6) +
::stat_cor(method = "pearson") +
ggpubrylab("LI.M156 Module \n CITE-seq Day 7 log2 FC") +
xlab("LI.M156 Module \n Microarray Day 7 log2 FC")
ggsave(p, filename = paste0(figpath, "/m156_gene_correlation.pdf"),width = 3, height = 3)
Single cell deconvolution of predictive signatures across protein
based subsets.
mid_res/d7_predictive_deconvolution/1.d7predictive.score.deconvolution.r
This script uses R 3.5.1
# Day 7 signature deconvolution direct with module score.
# R 3.5.1
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(magrittr))
suppressMessages(library(here))
source("functions/analysis_functions.R")
source('functions/MattPMutils.r')
# save path
= here("mid_res/d7_predictive_deconvolution/figures/")
figpath dir.create(figpath)
# day 7 core signatures.
= readRDS("signature_curation/core_d7.rds")
sig7
# h1 data baseline and day 7 cells.
= ReadCohort(joint_object_dir = "data/h1h5_annotated_with_meta.rds", cohort = "H1N1")
h1 = SetAllIdent(h1, id = "time_cohort") %>%
h1 SubsetData(ident.use = "d7")
# add module score
= AddModuleScore(h1, genes.list = sig7, seed.use = 1, enrich.name = names(sig7))
h1
# get long for mfor visualization of module score distribtion.
= h1@meta.data %>% select(
df_sig
celltype_joint,
timepoint,LI.M156_Plasma_Cell = `LI.M156.0 plasma cell b cell Ig6`,
CHI_4 = 'CHI 4 d710',
CHI_5 = `CHI 5 d711`,
CHI_d7_Response = `CHI d7 Response9`
%>%
) mutate(celltype_b = if_else(
%in% c("BC_Mem", "BC_Naive", "CD38_Bcell", "pDC"),
celltype_joint true = celltype_joint,
false = "other"
%>%
)) gather(module, module_score, LI.M156_Plasma_Cell:CHI_d7_Response) %>%
mutate(timepoint = factor(timepoint, levels = c("d0", "d7"))) %>%
mutate(celltype_b = str_replace_all(
string = celltype_b,
pattern = "_",
replacement = " "
%>%
)) mutate(celltype_joint = str_replace_all(
string = celltype_joint,
pattern = "_",
replacement = " "
%>%
)) mutate(celltype_b = factor(
celltype_b,levels = c("CD38 Bcell", "pDC", "BC Naive", "BC Mem", "other")
))
# plot CHI predictive sig from array
= df_sig %>% filter(timepoint == "d7" &
subplot %in% c("CHI_d7_Response", "CHI_4", "LI.M156_Plasma_Cell"))
module = subplot %>% mutate(
subplot module = dplyr::recode(
module,"LI.M156_Plasma_Cell" = "M156",
"CHI_d7_Response" = "Antibody sig",
"CHI_4" = "CHI 4"
)
)
= col.alpha(acol = 'grey',alpha = 0.2)
grey1
= ggplot(subplot, aes(
p x = reorder(celltype_joint, module_score),
y = module_score,
fill = celltype_joint
+
)) theme_bw() +
geom_violin(
show.legend = FALSE,
scale = "width",
size = 0.2,
draw_quantiles = c(0.5)
+
) facet_wrap( ~ module,
as.table = TRUE,
scales = "free_x",
ncol = 4) +
geom_hline(yintercept = 0,
size = 0.5,
linetype = "dashed") +
coord_flip() +
scale_fill_manual(values = c(rep(grey1, 5), "red", rep(grey1, 17))) +
theme(
panel.spacing = unit(0.2, "lines"),
strip.background = element_blank(),
strip.text = element_text(size = 8),
axis.text.y = element_text(size = 10, color = "black"),
axis.text.x = element_text(size = 7, color = "black"),
panel.border = element_blank()
+
) xlab("") + ylab("single cell score distribution \n day 7 bulk predictive signatures") +
ggtitle("Day 7 post-vaccination")
ggsave(p, filename = paste0(figpath,"d7core_Tirosh_mod_score_celltypes_sub.pdf"), width = 5, height = 5)
Single cell raw UMI count deconvolution of TNFRSF17 gene.
mid_res/d7_predictive_deconvolution/2.TNFRSF17.deconvolution.r4.0.5.r
# R 4.0.5
suppressMessages(library(tidyverse))
suppressMessages(library(scglmmr))
suppressMessages(library(here))
suppressMessages(library(magrittr))
source('functions/analysis_functions.R')
# save path
= here('mid_res/d7_predictive_deconvolution/figures/')
figpath
# single cell composition of m156
= readRDS(file = here('signature_curation/core_d7.rds'))
core7 = readRDS(file = here('data/h1h5_annotated_with_meta.rds'))
h1
= intersect(rownames(h1@data), core7$`LI.M156.0 plasma cell b cell Ig`)
m156_sub = as.data.frame(as.matrix(t(as.matrix(h1@data[m156_sub, ]))))
gene #pdf = cbind(gene, h1@meta.data)
#pdf = pdf %>% gather(gene, normcount, m156_sub[1]:m156_sub[length(m156_sub)])
# composisiton of TNFRSF17
= as.data.frame(as.matrix(t(as.matrix(h1@raw.data[m156_sub, ]))))
gene = cbind(gene, h1@meta.data)
pdf = pdf %>% gather(gene, count, m156_sub[1]:m156_sub[length(m156_sub)])
pdf = pdf %>%
tnf filter(cohort == 'H1N1') %>%
filter(gene == "TNFRSF17" & timepoint %in% c("d0", "d7") ) %>%
group_by(celltype_joint, timepoint, gene) %>%
summarise(n = sum(count)) %>%
ungroup()
# visualize distribution
$celltype_joint = str_replace_all(tnf$celltype_joint, pattern = '_',replacement = ' ')
tnf= ggplot(tnf, aes(x = reorder(celltype_joint, n), y = n, fill = timepoint)) +
p geom_bar(stat = "identity", position=position_dodge()) +
theme_bw() +
coord_flip() +
theme(axis.text.x = element_text(size = 9, color = 'black'), axis.title.x = element_text(color = 'black')) +
theme(axis.text.y = element_text(size = 9, color = 'black'), axis.title.y = element_text(color = 'black')) +
ylab("UMI counts") +
scale_fill_manual(values = c("grey48", "red")) +
theme(axis.title.y = element_blank()) +
theme(legend.position = c(0.7, 0.2) ) +
ggtitle("TNFRSF17 Gene")
pggsave(p, filename = paste0(figpath, "TNFRSF17_composition.pdf"),width = 3, height = 4.3)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] magrittr_2.0.3 here_1.0.1 scglmmr_0.1.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [8] readr_1.4.0 tidyr_1.1.2 tibble_3.1.8 ggplot2_3.3.3 tidyverse_1.3.0 viridis_0.5.1 viridisLite_0.3.0
#
# loaded via a namespace (and not attached):
# [1] utf8_1.2.2 tidyselect_1.2.0 lme4_1.1-26 RSQLite_2.2.7
# [5] AnnotationDbi_1.52.0 grid_4.0.5 BiocParallel_1.24.1 scatterpie_0.1.7
# [9] munsell_0.5.0 codetools_0.2-18 statmod_1.4.35 withr_2.4.3
# [13] colorspace_2.0-0 GOSemSim_2.16.1 Biobase_2.50.0 rstudioapi_0.13
# [17] ROCR_1.0-11 stats4_4.0.5 ggsignif_0.6.0 DOSE_3.16.0
# [21] labeling_0.4.2 MatrixGenerics_1.2.1 Rdpack_2.1.1 emmeans_1.5.4
# [25] GenomeInfoDbData_1.2.4 polyclip_1.10-0 bit64_4.0.5 farver_2.0.3
# [29] pheatmap_1.0.12 rprojroot_2.0.2 downloader_0.4 coda_0.19-4
# [33] vctrs_0.5.1 generics_0.1.2 TH.data_1.0-10 R6_2.5.0
# [37] doParallel_1.0.16 GenomeInfoDb_1.26.7 graphlayouts_0.7.2 locfit_1.5-9.4
# [41] bitops_1.0-6 cachem_1.0.4 fgsea_1.16.0 DelayedArray_0.16.3
# [45] assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16 ggraph_2.0.5
# [49] enrichplot_1.10.2 gtable_0.3.0 egg_0.4.5 tidygraph_1.2.0
# [53] sandwich_3.0-0 rlang_1.0.6 slanter_0.2-0 splines_4.0.5
# [57] rstatix_0.7.0 broom_0.7.5 BiocManager_1.30.10 reshape2_1.4.4
# [61] abind_1.4-5 modelr_0.1.8 backports_1.2.1 qvalue_2.22.0
# [65] clusterProfiler_3.18.1 tools_4.0.5 ellipsis_0.3.2 gplots_3.1.1
# [69] RColorBrewer_1.1-2 BiocGenerics_0.36.1 Rcpp_1.0.9 plyr_1.8.6
# [73] progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3 prettyunits_1.1.1
# [77] ggpubr_0.4.0 cowplot_1.1.1 S4Vectors_0.28.1 zoo_1.8-8
# [81] SummarizedExperiment_1.20.0 haven_2.4.3 ggrepel_0.9.1 fs_1.5.0
# [85] variancePartition_1.25.6 data.table_1.14.0 DO.db_2.9 openxlsx_4.2.3
# [89] RANN_2.6.1 reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0
# [93] matrixStats_0.58.0 hms_1.0.0 GSVA_1.38.2 xtable_1.8-4
# [97] pbkrtest_0.5-0.1 RhpcBLASctl_0.21-247.1 XML_3.99-0.6 rio_0.5.16
# [101] readxl_1.3.1 IRanges_2.24.1 gridExtra_2.3 compiler_4.0.5
# [105] KernSmooth_2.23-18 crayon_1.4.1 shadowtext_0.0.9 minqa_1.2.4
# [109] ggfun_0.0.4 lubridate_1.8.0 DBI_1.1.1 tweenr_1.0.2
# [113] dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27 Matrix_1.4-1
# [117] car_3.0-10 cli_3.4.1 rbibutils_2.0 parallel_4.0.5
# [121] igraph_1.2.6 GenomicRanges_1.42.0 pkgconfig_2.0.3 rvcheck_0.1.8
# [125] foreign_0.8-81 xml2_1.3.2 foreach_1.5.1 annotate_1.68.0
# [129] XVector_0.30.0 GeneOverlap_1.26.0 estimability_1.3 rvest_0.3.6
# [133] digest_0.6.27 graph_1.68.0 cellranger_1.1.0 fastmatch_1.1-0
# [137] edgeR_3.32.1 GSEABase_1.52.1 curl_4.3 gtools_3.8.2
# [141] nloptr_1.2.2.2 lifecycle_1.0.3 nlme_3.1-152 jsonlite_1.7.2
# [145] aod_1.3.1 carData_3.0-4 limma_3.46.0 fansi_0.4.2
# [149] pillar_1.8.1 lattice_0.20-41 fastmap_1.1.0 httr_1.4.2
# [153] survival_3.2-10 GO.db_3.12.1 glue_1.6.2 zip_2.1.1
# [157] iterators_1.0.13 bit_4.0.4 ggforce_0.3.3 stringi_1.5.3
# [161] blob_1.2.1 org.Hs.eg.db_3.12.0 caTools_1.18.1 memoise_2.0.0
This section is run with R 3.5.1
Construct mRNA-based monocyte single cell latent space with DDRTree. Infer pseudotime using monocle.
mid_res/monocyte_map/1_monocyte_map.r
# R 3.5
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(Seurat))
suppressMessages(library(here))
#source("de_workflow-master/downstream_analysis_functions.r")
source("functions/analysis_functions.R")
= readRDS("signature_curation/BTM_li.rds")
btm = here('mid_res/monocyte_map/generated_data/'); dir.create(datapath)
datapath
# select celltype on which to to run pseudotime analysis
= c("CD14_Mono", "CD16_Mono")
celltype_use
# subset time cohort
= ReadCohort(joint_object_dir = "data/h1h5_annotated_with_meta.rds", cohort = "H1N1") %>%
sub SetAllIdent(id = "time_cohort") %>%
SubsetData(ident.use = "d1", subset.raw = TRUE) %>%
SetAllIdent(id = "celltype_joint") %>%
SubsetData(ident.use = celltype_use, subset.raw = TRUE)
# addd proteins as meta data
= as.data.frame(t(sub@assay$CITE@data))
prot_dat = AddMetaData(sub, metadata = prot_dat)
sub
#### use DDR tree to calculate trajectory
library(monocle)
= monocle::importCDS(sub)
sm = BiocGenerics::estimateSizeFactors(sm)
sm <- detectGenes(sm, min_expr = 0.1)
sm <- row.names(subset(fData(sm),num_cells_expressed >= 15))
expressed_genes
## Select genes
= differentialGeneTest(sm[expressed_genes,],fullModelFormulaStr = "~timepoint",cores = 4)
time1_genes = grep(pattern = "RPL|RPS|MT-|RP11", x = expressed_genes, value = TRUE)
rpgene = time1_genes %>%
t1genes rownames_to_column("gene") %>%
filter(qval < 0.15) %>%
arrange(qval) %>%
filter(!gene %in% rpgene)
= t1genes %$% gene
t1gene
# set ordering filter and reduce dimensions by the ddr tree algorithm
= setOrderingFilter(sm, ordering_genes = t1gene)
msm = reduceDimension(sm, max_components = 2,
sm reduction_method = "DDRTree",
residualModelFormulaStr = "~ sampleid")
= orderCells(sm, reverse = TRUE)
sm saveRDS(sm, file = paste0(datapath, "sm_cd14_cd16_d1_monocle_object.rds"))
Integrate pseudotime, surface protein levels and time relative to
vaccination with mRNA based pseudotime calculated above to interpret 3
branches.
mid_res/monocyte_map/2_monocyte_mat_visualization.I.r
# R 3.5
# make visualization of main monocyte pseudotime axis.
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(Seurat))
suppressMessages(library(here))
suppressMessages(library(monocle))
source("functions/analysis_functions.R")
source('functions/MattPMutils.r')
= readRDS("signature_curation/BTM_li.rds")
btm = here("mid_res/monocyte_map/figures/"); dir.create(figpath, recursive = TRUE)
figpath = readRDS(file = here("mid_res/monocyte_map/generated_data/sm_cd14_cd16_d1_monocle_object.rds"))
sm
######
# visualization
= plot_cell_trajectory(sm, color_by = "Pseudotime")
p = ggplot_build(p)[["plot"]][["data"]] %>%
df rename(component_1 = data_dim_1 ,component_2 = data_dim_2)
$adjmfc.time = factor(df$adjmfc.time, levels = c("d0 low", "d1 low", "d0 high", "d1 high"))
dflibrary(cowplot)
# set plot theme
= list(theme_bw(),
theme.set theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank()))
# reverse order to make the root node go left to right
$component_1 = -1* df$component_1
df
# create main plot
= ggplot(df, aes(x = component_1, y = component_2, fill = Pseudotime)) +
p1 +
theme.set geom_point(shape = 21, size = 2.2, color = "grey" ,stroke = 0.1) +
theme(legend.position = c(0.1, 0.7)) +
theme(strip.background = element_blank()) +
ylab("mRNA trajectory component 2") +
xlab("mRNA trajectory component 1") +
scale_fill_viridis_c(option = "B") +
scale_color_manual(values = c("grey", "black"))
ggsave(p1, filename = paste0(figpath, "mono_trajectory_only.png"), width = 6, height = 5)
# make a background plot on which to add the canvas marginal plots
= ggplot(df, aes(x = component_1, y = component_2, fill = Pseudotime)) +
pnull +
theme.set theme(legend.position = c(0.1, 0.7)) +
theme(strip.background = element_blank()) +
theme(axis.title.x = element_text(size = 17)) +
theme(axis.title.y = element_text(size = 17)) +
ylab("mRNA trajectory component 2") +
xlab("mRNA trajectory component 1") +
scale_fill_viridis_c(option = "plasma") +
scale_color_manual(values = c("grey", "black"))
# real time for top margin
= c(
time.col col.alpha(acol = 'black', 0.1),
col.alpha(acol = ggsci::pal_jama()(2)[2], 0.4)
) = c(
timecol2 'black',
::pal_jama()(2)[2]
ggsci
)= axis_canvas(pnull, axis = "x") +
xd geom_density(data = df, aes(x = component_1, color = timepoint), size = 1) +
scale_color_manual(values = time.col)
## test
= axis_canvas(pnull, axis = "x") +
xd geom_density(data = df, aes(x = component_1, fill = timepoint, color = timepoint), size = 1) +
scale_fill_manual(values = time.col) +
scale_color_manual(values = timecol2)
xd##
# CD14 vs CD16 protein for bottom margin
= axis_canvas(pnull, axis = "x") +
x16 geom_smooth(data = df,
aes(x = component_1, y = CD16_PROT),
method = "loess", se = TRUE,
color = 'black') +
geom_smooth(data = df,
aes(x = component_1, y = CD14_PROT),
method = "loess", se = TRUE,
color = 'grey')
# add to plot
<- insert_xaxis_grob(pnull, xd, grid::unit(.2, "null"), position = "top")
p2 = insert_xaxis_grob(p2, x16, grid::unit(.3, "null"), position = "bottom")
p4 = ggdraw(p4)
p6
p6# save
ggsave(p6, filename = paste0(figpath, "monocyte_merged_plot.pdf"), width = 6, height = 8)
ggsave(p6, filename = paste0(figpath, "monocyte_merged_plot.png"), width = 6, height = 8)
Integrate “bottom up” single cell monocyte pseudotime reconstruction with “top down” mixed effects vaccine perturbation phenotypes. Within the genes defined by the pseudobulk mixed effects models (the section above) and in the leading edge of curated pathway enrichments, calculate branch dependent differential expression using BEAM. Define categories of genes based on their behavior across single cell pseudotime. mid_res/monocyte_map/3_monocyte_perturbation_integration.r
# R 3.5
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(Seurat))
suppressMessages(library(here))
library(monocle)
source("functions/analysis_functions.R")
source("functions/MattPMutils.r")
= here("mid_res/monocyte_map/figures/"); dir.create(figpath, recursive = TRUE)
figpath = here("mid_res/monocyte_map/generated_data/"); dir.create(datapath, recursive = TRUE)
datapath
# load monocle object
= readRDS(file = here("mid_res/monocyte_map/generated_data/sm_cd14_cd16_d1_monocle_object.rds"))
sm
# load day 1 enrichment from monocytes
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
g1c = g1c$CD14_Mono %>%
g1 filter(NES > 0) %>%
filter(padj < 0.05)
= g1$leadingEdge
monole names(monole) = g1$pathway
= unique(unlist(monole))
cgene2 saveRDS(cgene2,file = paste0(datapath, 'cgene2.rds'))
# define branch dependent genes
<- BEAM(sm[cgene2, ], branch_point = 1, cores = 4)
de.branch saveRDS(de.branch,file = paste0(datapath,'de_branch.rds'))
= readRDS(file = here('mid_res/monocyte_map/generated_data/de_branch.rds'))
de.branch = de.branch %>% filter(qval < 0.05)
de.branch.sub = as.character(de.branch.sub$gene_short_name)
branch.genes
# Visualization of branch genes
= exprs(sm)[branch.genes, ] %>%
led t() %>%
as.matrix() %>%
as.data.frame()
= cbind(led, sm@phenoData@data)
d
= d %>% select(OASL, CCL2, IFITM2, FCER1G, TNFSF10, FCGR1B)
dd $Pseudotime = d$Pseudotime
dd$timepoint = d$timepoint
dd
# mono act
= dd %>% filter(Pseudotime > 5)
dd = dd %>% gather(gene, value, OASL:FCGR1B)
dd
# vis theme
= list(
mtheme theme_bw(),
geom_smooth(size = 2),
theme(axis.title = element_text(size = 18)),
::scale_color_jama(alpha = 0.8),
ggscitheme(legend.position = c(0.2, 0.8)),
xlab('Pseudotime')
)
# example category 1 gene
= ggplot(dd %>% filter(gene %in% c('CCL2')),
p1 aes(x = Pseudotime, y = value, color = timepoint)) +
ylab('CCL2 Expression') +
mtheme ggsave(p1, filename = paste0(figpath,'Category1_1_CCL2.pdf'), width = 3.5, height = 3.5)
# example category 2 gene
= ggplot(dd %>% filter(gene %in% c('TNFSF10')),
p1 aes(x = Pseudotime, y = value, color = timepoint)) +
ylab('TNFSF10 Expression') +
mtheme ggsave(p1, filename = paste0(figpath,'Category1_2_TNFSF10.pdf'), width = 3.5, height = 3.5)
# example category 2 gene
= ggplot(dd %>% filter(gene %in% c('FCER1G')),
p1 aes(x = Pseudotime, y = value, color = timepoint)) +
ylab('FCER1G Expression') +
mtheme ggsave(p1, filename = paste0(figpath,'Category2_1_FCER1G.pdf'), width = 3.5, height = 3.5)
# example category 2 gene
= ggplot(dd %>% filter(gene %in% c('IFITM2')),
p1 aes(x = Pseudotime, y = value, color = timepoint)) +
+
mtheme ylab('IFITM2 Expression')
ggsave(p1, filename = paste0(figpath,'Category2__1_IFITM2.pdf'), width = 3.5, height = 3.5)
#########################
# leading edge signatures analysis
# Interferon
= intersect(branch.genes, monole$`reactome interferon signaling`)
sig pdf(file = paste0(figpath, 'IFN_branch_de.pdf'),width = 5, height = 5)
plot_genes_branched_heatmap(sm[sig, ],
branch_point = 1,
cores = 1,
num_clusters = 3,
use_gene_short_name = T,
show_rownames = T)
dev.off()
# set categ
= c('IFITM2', 'PTPN1', 'EIF4E2', 'IFITM3', 'HLA-C')
cat2 = sig[!sig %in% cat2]
cat1 # get data
= exprs(sm)[sig, ] %>%
led t() %>%
as.matrix() %>%
as.data.frame()
= cbind(led, sm@phenoData@data)
d = d %>% select(sig)
dd = apply( dd, 2, scale.simple) %>% as.data.frame()
dd $Pseudotime = d$Pseudotime
dd$timepoint = d$timepoint
dd= sig[1]
index1 = sig[length(sig)]
index2 = dd %>% gather(gene, value, index1:index2 )
d3 $cat = ifelse(d3$gene %in% cat1, '1', '2')
d3
= ggplot(data = d3 %>%
p1 filter(Pseudotime > 5 & cat ==1 )) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd0' & cat ==1 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(1), se = FALSE) +
geom_smooth(data = d3 %>% filter(Pseudotime > 5 & timepoint == 'd1' & cat ==1 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(2)[2],
se = FALSE, size = 2) +
theme_bw() +
geom_vline(xintercept = 9.5, linetype = 'dashed') +
theme(strip.background = element_blank()) +
ylab('reactome interferon \n Category 1 genes ') +
theme(axis.title = element_text(size = 18))
p1ggsave(p1,filename = paste0(figpath, 'IFNcat1.pdf'), width = 3.7, height = 3.5)
= ggplot(data = d3 %>%
p2 filter(Pseudotime > 5 & cat ==2 )) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd0' & cat ==2 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(1), se = FALSE) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd1' & cat ==2),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(2)[2],
se = FALSE, size = 2) +
theme_bw() +
geom_vline(xintercept = 9.5, linetype = 'dashed') +
theme(strip.background = element_blank()) +
ylab('reactome interferon \n Category 2 genes ') +
theme(axis.title = element_text(size = 18))
p2ggsave(p2,filename = paste0(figpath, 'IFNcat2.pdf'), width = 3.7, height = 3.5)
# mtor hypoxia
= intersect(branch.genes,
sig c(monole$`HALLMARK hypoxia`, monole$`HALLMARK MTORC1 signaling`)
)
pdf(file = paste0(figpath, 'mtorhypoxia_branch_de.pdf'),width = 5, height = 5)
plot_genes_branched_heatmap(sm[sig, ],
branch_point = 1,
cores = 1,
num_clusters = 3,
use_gene_short_name = T,
show_rownames = T)
dev.off()
#set categ
= c('CTSC', 'PFKL', 'ACTR3', 'CITED2', 'PGK1', 'INSIG1', 'CHST2')
cat2 = sig[!sig %in% cat2]
cat1
# get data
= exprs(sm)[sig, ] %>%
led t() %>%
as.matrix() %>%
as.data.frame()
= cbind(led, sm@phenoData@data)
d = d %>% select(sig)
dd = apply( dd, 2, scale.simple) %>% as.data.frame()
dd $Pseudotime = d$Pseudotime
dd$timepoint = d$timepoint
dd= sig[1]
index1 = sig[length(sig)]
index2 = dd %>% gather(gene, value, index1:index2 )
d3 $cat = ifelse(d3$gene %in% cat1, '1', '2')
d3
= ggplot(data = d3 %>%
p1 filter(Pseudotime > 5 & cat ==1 )) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd0' & cat ==1 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(1), se = FALSE) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd1' & cat ==1 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(2)[2],
se = FALSE, size = 2) +
theme_bw() +
geom_vline(xintercept = 9.5, linetype = 'dashed') +
theme(strip.background = element_blank()) +
ylab('MTORC1 and Hypoxia\n Category 1 genes ') +
theme(axis.title = element_text(size = 18))
p1ggsave(p1,filename = paste0(figpath, 'mtorcat1.pdf'), width = 3.7, height = 3.5)
= ggplot(data = d3 %>%
p2 filter(Pseudotime > 5 & cat ==2 )) +
geom_smooth(data = d3 %>%
filter(Pseudotime > 5 & timepoint == 'd0' & cat ==2 ),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(1), se = FALSE) +
geom_smooth(data = d3 %>% filter(Pseudotime > 5 & timepoint == 'd1' & cat ==2),
mapping = aes(x = Pseudotime, y = value, group = gene),
color = ggsci::pal_jama(alpha = 0.2)(2)[2],
se = FALSE, size = 2) +
theme_bw() +
geom_vline(xintercept = 9.5, linetype = 'dashed') +
theme(strip.background = element_blank()) +
ylab('MTORC1 and Hypoxia\n Category 2 genes ') +
theme(axis.title = element_text(size = 18))
ggsave(p2,filename = paste0(figpath, 'mtorcat2.pdf'), width = 3.7, height = 3.5)
Pathway enrichment within the integrated pseudotime and perturbation
based gene categories defined above.
note due to issues the enrichr R package servers, the html web based
enrichr server was used here. This script contains links to those
results and the genes used as input. This script uses the
LeadingEdgeIndexed function from scglmmr requiring R 4.0.5.
mid_res/monocyte_map/4_genecat_enrichr.r
# R 4.0.5
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
library(enrichR)
= here("mid_res/monocyte_map/figures/"); dir.create(figpath, recursive = TRUE)
figpath = here("mid_res/monocyte_map/generated_data/"); dir.create(datapath, recursive = TRUE)
datapath
# monocyte leading edge d1
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
g1c = lapply(g1c, function(x) x %>% filter(NES > 0))
g1c = LeadingEdgeIndexed(gsea.result.list = g1c,padj.threshold = 0.05)
mono.le
# branch ependent genes
= readRDS(file = here('mid_res/monocyte_map/data/de_branch.rds'))
de.branch = de.branch %>% filter(qval < 0.05)
de.branch.sub = as.character(de.branch.sub$gene_short_name)
branch.genes
# enrichr
<- c("GO_Molecular_Function_2015",
dbs "GO_Cellular_Component_2015",
"GO_Biological_Process_2015")
= c('IFITM2', 'PTPN1', 'EIF4E2', 'IFITM3', 'HLA-C')
cat2.ifn = intersect(mono.le$CD14_Mono$`reactome interferon signaling`, branch.genes)
cat1.ifn = setdiff(cat1.ifn, cat2.ifn)
cat1.ifn
# mtor
=c('CTSC', 'PFKL', 'ACTR3', 'CITED2', 'PGK1', 'INSIG1', 'CHST2')
cat2.mtor = intersect(mono.le$CD14_Mono$`HALLMARK MTORC1 signaling`, branch.genes)
cat1.mtor = setdiff(cat1.mtor, cat2.mtor)
cat1.mtor
# Cat 1 Mtor
# https://maayanlab.cloud/Enrichr/enrich?dataset=eb3028063e4833deb6212e24235dffee
# Cat 2 mtor
# https://maayanlab.cloud/Enrichr/enrich?dataset=ace0e94a6d3c1fa0037be65c0f677aa2
# Cat 1 IFN
# https://maayanlab.cloud/Enrichr/enrich?dataset=537c52fe1a6621033587e9048bf20e98
# Cat 2 ifn
# https://maayanlab.cloud/Enrichr/enrich?dataset=e3ef3178aeb695f05f90ae855b80da21
Fit combined mixed model with unadjuvanted and adjuvanted subjects
and apply contrast to define difference in 24h fold change post
vaccination within protein subsets adjusted for age and sex.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/1_V4_AS03_contrastmodel.r
# R version 4.0.5
# H5 vs H1 day 1 DE contrast pseudobulk model
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(variancePartition))
#suppressMessages(library(scglmmr))
source("functions/analysis_functions.R")
source('functions/scglmmr.functions.R')
# make output directories
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/");
datapath dir.create(datapath, recursive = TRUE)
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/figuresV4/");
figpath dir.create(figpath, recursive = TRUE)
# parallel options for dream lme4 fits
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# read metadata to exract sample names in analysis
= readRDS(file = here('data/samplemd.rds'))
samplemd = samplemd %>% filter(time_cohort =='d1') %$% sample
d1sx
# read processed pseudobulk data
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb
# remove cell type string from sample names and subset to day 1 cohort
= gsub("~.*","",colnames(pb[[1]]))
cnames = lapply(pb, function(x){
pb %>% as.data.frame() %>%
x setNames(nm = cnames) %>%
select(all_of(d1sx)) %>%
as.matrix()
})
# sample metadata for contrast model
=readRDS(file = here('data/samplemd.rds'))
samplemd =
samplemd %>%
samplemd filter(! time_cohort == 'd7') %>%
mutate(scaledage = (age - mean(age)) / sd(age)) %>%
rename('subjectid' = sampleid) %>%
rename('group' = adjmfc.group) %>%
mutate(group = ifelse(group %in% c('high', ' low'), "NOAS03", "AS03")) %>%
mutate(time.group = paste(timepoint, group, sep = "_"))
# get rid of space
$time.group = str_replace_all(
samplemdstring = samplemd$time.group,
pattern = ' ',
replacement = ''
)# re-level time.group into ordered combined factor
$time.group = factor(
samplemd$time.group,
samplemdlevels = c('d0_AS03', 'd1_AS03', 'd0_NOAS03', 'd1_NOAS03')
)# format metadata
= samplemd %>%
samplemd remove_rownames() %>%
column_to_rownames('sample')
# designmat
= samplemd[ ,c('gender', 'scaledage', 'time.group')]
met = model.matrix( ~ 0 + time.group + gender + scaledage, data = met)
mat
################################
# specify random intercept model formula with combined factor
<- ~ 0 + time.group + gender + scaledage + (1|subjectid)
f1
# specify contrast matrix to test the fold change difference
# based on levels of time.group this should be cmat = c(-1, 1, 1, -1, 0, 0)
= makeContrastsDream(formula = f1, data = samplemd,
L2 contrasts = c(delta = "(time.groupd1_AS03 - time.groupd0_AS03) - (time.groupd1_NOAS03 - time.groupd0_NOAS03)")
)plotContrasts(L2) + ggsave(filename = paste0(figpath,'contrastmodel.pdf'), width = 7, height = 4)
# fit model on each subset
# init store
= v1 = list()
fit1 for (i in 1:length(pb)) {
# init data
= samplemd
meta = f1
form = L2
contrast_matrix = pb[[i]]
counts
# dge list
= edgeR::DGEList(counts = counts, samples = meta)
d
# filter cell type specific lowly expressed genes and calc norm factors
= edgeR::filterByExpr(y = d$counts, min.count = 3, design = mat)
gtable print(names(pb)[i]);print(table(gtable))
= d[gtable, keep.lib.sizes=FALSE]
d = edgeR::calcNormFactors(object = d)
d
# get voom observation level weights
= voomWithDreamWeights(counts = d,
v formula = form,
data = meta,
BPPARAM = pparam,
plot = TRUE, save.plot = TRUE)
# fit contrast mixed model
= dream(exprObj = v,
fitmm formula = form,
data = meta,
L = contrast_matrix,
BPPARAM = pparam,
useWeights = TRUE, REML = TRUE)
# save results
= v
v1[[i]] = fitmm
fit1[[i]]
}names(v1) = names(pb)
# save day 1 contrast fit
saveRDS(object = fit1, file = paste0(datapath, 'fit12.rds'))
saveRDS(object = v1, file = paste0(datapath, 'v12.rds'))
# apply eBayes to model fit
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/fit12.rds'))
fit12
# over prev fit1e - compare
= lapply(fit12, variancePartition::eBayes)
fit12e names(fit12e) = names(pb)
saveRDS(object = fit12e, file = paste0(datapath, 'fit12e.rds'))
# save model fitting data
saveRDS(object = samplemd, file = paste0(datapath, 'samplemd12.rds'))
saveRDS(object = pb, file = paste0(datapath, 'pb12.rds'))
saveRDS(object = L2, file = paste0(datapath, 'L212.rds'))
# R 4.2.0
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] viridis_0.6.3 viridisLite_0.4.2 simr_1.0.7 lme4_1.1-33
# [5] Matrix_1.5-4 fgsea_1.24.0 ggpubr_0.6.0 magrittr_2.0.3
# [9] variancePartition_1.28.9 BiocParallel_1.32.6 limma_3.54.2 here_1.0.1
# [13] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2
# [17] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
# [21] ggplot2_3.4.2 tidyverse_2.0.0
#
# loaded via a namespace (and not attached):
# [1] utf8_1.2.3 spatstat.explore_3.2-1 reticulate_1.28 RUnit_0.4.32
# [5] tidyselect_1.2.0 htmlwidgets_1.6.2 grid_4.2.0 Rtsne_0.16
# [9] devtools_2.4.5 munsell_0.5.0 codetools_0.2-19 ragg_1.2.5
# [13] ica_1.0-3 future_1.32.0 miniUI_0.1.1.1 withr_2.5.0
# [17] spatstat.random_3.1-5 colorspace_2.1-0 progressr_0.13.0 Biobase_2.58.0
# [21] Superpower_0.2.0 knitr_1.42 rstudioapi_0.14 Seurat_4.3.0
# [25] ROCR_1.0-11 ggsignif_0.6.4 tensor_1.5 listenv_0.9.0
# [29] emmeans_1.8.5 Rdpack_2.4 labeling_0.4.2 RLRsim_3.1-8
# [33] polyclip_1.10-4 pheatmap_1.0.12 farver_2.1.1 rprojroot_2.0.3
# [37] parallelly_1.35.0 vctrs_0.6.2 generics_0.1.3 afex_1.3-0
# [41] clusterGeneration_1.3.7 xfun_0.39 timechange_0.2.0 R6_2.5.1
# [45] doParallel_1.0.17 locfit_1.5-9.8 bitops_1.0-7 spatstat.utils_3.0-3
# [49] cachem_1.0.8 promises_1.2.0.1 scales_1.2.1 nnet_7.3-19
# [53] gtable_0.3.3 egg_0.4.5 globals_0.16.2 processx_3.8.1
# [57] goftest_1.2-3 rlang_1.1.1 slanter_0.2-0 systemfonts_1.0.4
# [61] splines_4.2.0 rstatix_0.7.2 lazyeval_0.2.2 checkmate_2.2.0
# [65] spatstat.geom_3.2-1 broom_1.0.4 BiocManager_1.30.20 yaml_2.3.7
# [69] reshape2_1.4.4 abind_1.4-5 backports_1.4.1 httpuv_1.6.10
# [73] Hmisc_5.1-0 tools_4.2.0 usethis_2.2.0 gplots_3.1.3
# [77] ellipsis_0.3.2 RColorBrewer_1.1-3 BiocGenerics_0.44.0 sessioninfo_1.2.2
# [81] ggridges_0.5.4 Rcpp_1.0.10 plyr_1.8.8 base64enc_0.1-3
# [85] progress_1.2.2 ps_1.7.5 prettyunits_1.1.1 rpart_4.1.19
# [89] remaCor_0.0.11 deldir_1.0-6 pbapply_1.7-0 cowplot_1.1.1
# [93] urlchecker_1.0.1 zoo_1.8-12 SeuratObject_4.1.3 ggrepel_0.9.3
# [97] cluster_2.1.4 fs_1.6.2 data.table_1.14.8 scattermore_1.2
# [101] lmerTest_3.1-3 lmtest_0.9-40 RANN_2.6.1 mvtnorm_1.1-3
# [105] fitdistrplus_1.1-11 matrixStats_0.63.0 pkgload_1.3.2 hms_1.1.3
# [109] patchwork_1.1.2 mime_0.12 evaluate_0.21 xtable_1.8-4
# [113] pbkrtest_0.5.2 RhpcBLASctl_0.23-42 gridExtra_2.3 compiler_4.2.0
# [117] KernSmooth_2.23-21 crayon_1.5.2 minqa_1.2.5 htmltools_0.5.5
# [121] mgcv_1.8-42 later_1.3.1 tzdb_0.3.0 Formula_1.2-5
# [125] snow_0.4-4 DBI_1.1.3 MASS_7.3-60 boot_1.3-28.1
# [129] car_3.1-2 cli_3.6.1 rbibutils_2.2.13 parallel_4.2.0
# [133] igraph_1.4.2 pkgconfig_2.0.3 numDeriv_2016.8-1.1 foreign_0.8-84
# [137] sp_1.6-0 binom_1.1-1.1 plotly_4.10.2 spatstat.sparse_3.0-1
# [141] foreach_1.5.2 estimability_1.4.1 callr_3.7.3 digest_0.6.31
# [145] sctransform_0.3.5 RcppAnnoy_0.0.20 spatstat.data_3.0-1 rmarkdown_2.21
# [149] leiden_0.4.3 fastmatch_1.1-3 htmlTable_2.4.1 edgeR_3.40.2
# [153] uwot_0.1.14 curl_5.0.0 gtools_3.9.4 shiny_1.7.4
# [157] nloptr_2.0.3 lifecycle_1.0.3 nlme_3.1-162 jsonlite_1.8.4
# [161] aod_1.3.2 carData_3.0-5 desc_1.4.2 fansi_1.0.4
# [165] pillar_1.9.0 ggsci_3.0.0 lattice_0.21-8 plotrix_3.8-2
# [169] fastmap_1.1.1 httr_1.4.6 pkgbuild_1.4.1 survival_3.5-5
# [173] glue_1.6.2 remotes_2.4.2 png_0.1-8 iterators_1.0.14
# [177] stringi_1.7.12 profvis_0.3.8 textshaping_0.3.6 caTools_1.18.2
# [181] memoise_2.0.1 irlba_2.3.5.1 future.apply_1.10.0
Gene set enrichment and defining As03 specific cell type specific
leading edge phenotypes.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/2_V4_AS03_contrastmodel_enrichment.r
# H5 vs H1 day 1 DE contrast pseudobulk model Enrichment
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(variancePartition))
suppressMessages(library(magrittr))
#suppressMessages(library(scglmmr))
source(here('functions/scglmmr.functions.R'))
# specify output directories and init parallel opts
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/")
datapath dir.create(datapath)
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/figuresV4/gsea/")
figpath dir.create(figpath, recursive = TRUE)
# parallel options
::register(BiocParallel::SnowParam(4))
BiocParallel= BiocParallel::SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# load combined modules -- rm the RP gene outliers
= readRDS(file = here('signature_curation/combined_sig_sub.rds'))
cmod
# load contrast fit results
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/fit12e.rds'))
fit12e = ExtractResult(
toprank model.fit.list = fit12e,
what = 'lmer.z.ranks',
coefficient.number = 1,
coef.name = 'delta'
)
# gsea on combined modules
= FgseaList(rank.list.celltype = toprank,pathways = cmod, BPPARAM = pparam)
gc saveRDS(gc,file = paste0(datapath, 'gc.rds'))
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc
# index leading edge genes
= LeadingEdgeIndexed(gsea.result.list = gc,padj.threshold = 0.05)
li = Filter(li,f = length)
li saveRDS(li,file = paste0(datapath, 'li.rds'))
# jaccard enrichment
= lapply(gc, function(x) x %>% filter(padj < 0.05))
gsub = Filter(gsub, f = nrow)
gsub = EnrichmentJaccard(gsealist = gsub,
ji indexedgenes = li,
saveplot = FALSE,
figpath = figpath,
returnJaccardMtx = TRUE)
saveRDS(ji$sortedgsea,file = paste0(datapath, 'sortedgsea.rds'))
# subet of leading edge genes upregulated in contrast model only.
= lapply(gc, function(x)
g.up %>% filter(padj < 0.1 & NES > 0)
x
)= LeadingEdgeIndexed(gsea.result.list = g.up, padj.threshold = 0.1)
li.up = Filter(li.up, f = length)
li.up saveRDS(li.up,file = paste0(datapath, 'li.up.rds'))
# add top genes by effect
= ExtractResult(model.fit.list = fit12e, coefficient.number = 1, coef.name = 'delta')
res = lapply(res, function(x)
topgene %>%
x filter(logFC > 0.25 & P.Value < 0.03) %$%
gene
)saveRDS(topgene, file = paste0(datapath, 'topgene.rds'))
Calculate log cpm for gene distribution visualization in next
script.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/3_V4_calc_logcpm_tidyaveragedata.r
# average gene distributions - part 1
suppressMessages(library(tidyverse))
suppressMessages(library(here))
#suppressMessages(library(scglmmr))
source(here('functions/MattPMutils.r'))
source('functions/scglmmr.functions.R')
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gene_dist/");
datapath dir.create(datapath)
# calculate logcpm
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/pb12.rds'))
pb = readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/samplemd12.rds'))
samplemd = factor(samplemd$time.group)
time.group = list()
av for (i in 1:length(pb)) {
= edgeR::DGEList( pb[[i]] )
dge = edgeR::filterByExpr(y = dge$counts, min.count = 3, design = time.group)
gtable = dge[gtable, ]
dge = edgeR::cpm(dge, log = TRUE)
av[[i]]
}names(av) = names(pb)
# create tidy data for gene visualization and calculation of average signature scores
# get tidy summary data
= list()
av_tidy for (i in 1:length(av)) {
= names(av)[i]
ct = rownames(av[[i]])
gs = GetTidySummary(av.exprs.list = av,
av_tidy[[i]] celltype.index = i,
genes.use = gs) %>%
mutate(cohort = if_else( str_sub(sample, 1,2) == "H5", "H5N1", "H1N1")) %>%
mutate(group = paste(str_sub(sample, -2,-1), cohort))
$group= factor(av_tidy[[i]]$group,
av_tidy[[i]]levels =
c("d0 H1N1" ,"d1 H1N1" ,"d0 H5N1","d1 H5N1"))
$group = plyr::revalue(av_tidy[[i]]$group,
av_tidy[[i]]c("d0 H1N1" = "d0 No-AS03",
"d1 H1N1" = "d1 No-AS03",
"d0 H5N1" = "d0 AS03",
"d1 H5N1" = "d1 AS03"))
}names(av_tidy) = names(av)
saveRDS(av_tidy, file = paste0(datapath,'av_tidy.rds'))
Visualize gene distributions of AS03 specific perturbation effects
across time between unadjuvanted and AS03 adjuvanted cohorts.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/5_V4_AS03model_gene_distributions.r
suppressMessages(library(here))
suppressMessages(library(tidyverse))
#suppressMessages(library(scglmmr))
source('functions/scglmmr.functions.R')
source(here('functions/MattPMutils.r'))
# save path
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/as03fig/")
datapath = here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/figuresV4/as03fig/")
figpath
# load aggregated tidy data
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gene_dist/av_tidy.rds'))
av_tidy
# load mixed model fit res v4
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/fit12e.rds'))
fit12e = ExtractResult(model.fit.list = fit12e, coefficient.number = 1,coef.name = 'delta')
fitres
# load gsea results
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc
# load leading edge indexed
= readRDS(file = 'mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/li.rds')
li
# load top genes
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/topgene.rds'))
topgene
# specify specs
= c("grey48", "grey", "grey48", "deepskyblue3")
cu = sapply(cu, col.alpha, alpha = 0.4) %>% unname()
cu.alpha
= list(
mtheme theme_bw(base_size = 10.5),
theme(axis.title.x = element_blank()),
theme(axis.text.x=element_text(angle = -90, hjust = 0)),
theme(strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold",family = "Helvetica"),
axis.text.y = element_text(size = 6),
# new line
axis.text.x = element_text(angle = 45, hjust = 1, size = 6, color = 'black')
)
)= list(
box_gg ylab("log CPM"),
geom_boxplot(show.legend = FALSE, outlier.shape = NA),
mtheme
)
for (i in 1:length(li$CD14_Mono)) {
= fitres$CD14_Mono %>% filter(P.Value < 0.05)
topr names(li$CD14_Mono)[i] %>% print()
::intersect(topr$gene, unlist(li$CD14_Mono[i],use.names = FALSE) ) %>% print()
dplyr
}
# Format monocyte subset plot
#gene_highlight = c('MB21D1', 'FPR2', 'P2RY13', 'TLR4')
= c('MB21D1', 'FPR2', 'P2RY13', 'IFIT3')
gene_highlight $CD14_Mono %>%
fitresfilter(gene %in% gene_highlight) %>%
select(gene, P.Value, adj.P.Val, z.std, contrast, celltype)
# gene P.Value adj.P.Val z.std contrast celltype
# 1 FPR2 4.950912e-08 0.0004864845 5.453064 delta CD14_Mono
# 2 P2RY13 8.134318e-07 0.0012724398 4.932117 delta CD14_Mono
# 3 MB21D1 4.226564e-04 0.0351672126 3.525517 delta CD14_Mono
# 4 IFIT3 8.919633e-04 0.0520128594 3.322557 delta CD14_Mono
= av_tidy$CD14_Mono %>% filter(gene %in% gene_highlight)
mplt $gene[mplt$gene == "MB21D1"] = "CGAS"
mplt$gene = factor(mplt$gene, levels = c("CGAS",'FPR2', 'P2RY13', 'IFIT3'))
mplt
# monocyte subset
= ggplot(mplt, aes(x =group, y = count, fill = group , color = group)) +
p +
mtheme +
box_gg facet_wrap(~gene, scales = "free", nrow = 1) +
scale_color_manual(values = cu) +
scale_fill_manual(values = cu.alpha)
pggsave(p, filename = paste0(figpath, 'mono.subset.pdf'), width = 3.5, height = 2.3)
# mDC
for (i in 1:length(li$mDC)) {
= fitres$mDC %>% filter(P.Value < 0.05)
topr names(li$mDC)[i] %>% print()
::intersect(topr$gene, unlist(li$mDC[i],use.names = FALSE) ) %>% print()
dplyr
}
#gene_highlight = c('FPR1', 'CCR1', 'P2RY13', 'TLR4')
= c('FPR1', 'CCR1', 'P2RY13', 'SCIMP')
gene_highlight $mDC %>%
fitresfilter(gene %in% gene_highlight)%>%
select(gene, P.Value, adj.P.Val, z.std, contrast, celltype)
# gene P.Value adj.P.Val z.std contrast celltype
# 1 SCIMP 0.0001868746 0.1750548 3.736129 delta mDC
# 2 P2RY13 0.0011814319 0.3280209 3.243325 delta mDC
# 3 FPR1 0.0133877926 0.5692877 2.473283 delta mDC
# 4 CCR1 0.0160817638 0.5787543 2.407055 delta mDC
= av_tidy$mDC %>% filter(gene %in% gene_highlight)
mplt $gene = factor(mplt$gene , levels =c('FPR1', 'CCR1', 'P2RY13', 'SCIMP'))
mplt# mDC subset
= ggplot(mplt, aes(x =group, y = count, fill = group , color = group)) +
p +
mtheme +
box_gg facet_wrap(~gene, scales = "free", nrow = 1) +
scale_color_manual(values = cu) +
scale_fill_manual(values = cu.alpha)
pggsave(p, filename = paste0(figpath, 'mdc.subset.pdf'), width = 3., height = 2.3)
# B cell Naive
for (i in 1:length(li$BC_Naive)) {
names(li$BC_Naive)[i] %>% print()
::intersect(topgene$BC_Naive, unlist(li$BC_Naive[i],use.names = FALSE) ) %>%
dplyrprint()
}#bplt = av_tidy$BC_Naive %>% filter(gene == 'PMAIP1')
= c('PMAIP1', 'BTG1')
gene_highlight $BC_Naive %>%
fitresfilter(gene %in% gene_highlight) %>%
select(gene, P.Value, adj.P.Val, z.std, contrast, celltype)
= av_tidy$BC_Naive %>% filter(gene %in% c('PMAIP1', 'BTG1'))
bplt $gene[bplt$gene == "PMAIP1"] = "NOXA (PMAIP1)"
bplt
# B cell subset
= ggplot(bplt, aes(x =group, y = count, fill = group , color = group)) +
p +
mtheme +
box_gg facet_wrap(~gene, scales = "free", nrow = 1) +
scale_color_manual(values = cu) +
scale_fill_manual(values = cu.alpha)
pggsave(p, filename = paste0(figpath, 'Bcell.PMAIP1.pdf'), width = 2.5, height = 2.5)
Further analysis of B cell phenotypes. Single cell model of apoptosis
signature and correlation of apoptosis signature with B CD40 Activation
signature.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/6_V4_bsignal.r
# b cell figures
suppressMessages(library(tidyverse))
suppressMessages(library(here))
source('functions/scglmmr.functions.R')
suppressMessages(library(magrittr))
suppressMessages(library(emmeans))
suppressMessages(library(Seurat))
source(here('functions/MattPMutils.r'))
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/figuresV4/bsig/")
figpath dir.create(figpath)
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/bsig_data/")
datapath dir.create(datapath)
# B cell signals from CITE-seq cohort
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc = gc %>% bind_rows(.id = 'celltype') %>%
d filter(celltype == 'BC_Naive') %>%
filter(padj < 0.05)
###### gsea plot subset
= list(
mtheme1 theme_bw(base_size = 10.5),
theme(text = element_text(color = 'black')),
theme(strip.background = element_blank(),
strip.text = element_text(face = "bold",family = "Helvetica"),
axis.text.y = element_text(size = 12, color = 'black'))
)= ggplot(d, aes(x = NES, y = reorder(pathway, NES),
p fill = celltype, size = -log10(padj)), group = celltype ) +
+
mtheme1 theme(axis.text.y = element_text(size = 9)) +
ylab("") +
xlab('Normalized Enrichment Score') +
geom_vline(xintercept = 0, linetype = 'dashed') +
geom_point(shape = 21 , fill = 'deepskyblue3')
ggsave(p,filename = paste0(figpath, 'BCNaive.as03.enrichment.pdf'), width = 6, height = 3)
# Load day 1 object for both cohorts bcells
= readRDS(file = "data/h1h5_annotated_with_meta.rds")
s = s@meta.data %>%
md filter(celltype_joint == 'BC_Naive') %>%
filter(time_cohort == 'd1')
= s@raw.data[ ,md$barcode_check]
umi = s@assay$CITE@data[ ,md$barcode_check]
adt
# log normalize rna
= CreateSeuratObject(counts = umi, meta.data = md)
s = NormalizeData(s,normalization.method = 'LogNormalize')
s
# plot B cell protein distributions
= cbind(s@meta.data, as.data.frame(t(adt)))
d = c("CD19_PROT", "CD20_PROT", "IgD_PROT", "CD27_PROT","IgM_PROT",
prot_vis"CD21_PROT", "CD40_PROT", "CD38_PROT", "CD24_PROT", "CD14_PROT",
"CD3_PROT")
= d %>%
dpl filter(celltype_joint == "BC_Naive") %>%
select(all_of(prot_vis), sample, cohort) %>%
gather(protein, dsb_norm_value, prot_vis[1]:prot_vis[length(prot_vis)])
$protein = factor(dpl$protein, levels = rev(prot_vis))
dpl$protein = str_sub(dpl$protein, 1, -6)
dpl$cohort[dpl$cohort == 'H5N1'] = 'AS03'
dpl$cohort[dpl$cohort == 'H1N1'] = 'No AS03'
dpl= ggplot(dpl, aes(x = dsb_norm_value, y = reorder(protein, dsb_norm_value), color = cohort, fill = cohort )) +
p ::geom_density_ridges2(show.legend = FALSE, size = 0.3 ) +
ggridgestheme_bw() +
facet_wrap(~cohort) +
geom_vline(xintercept = 0, color = 'black', linetype = 'dashed') +
scale_fill_manual(values = c(col.alpha("grey", 0.8), col.alpha("deepskyblue3", 0.8))) +
scale_color_manual(values =c(col.alpha("grey", 0.8), col.alpha("deepskyblue3", 0.8))) +
ggtitle("Naive B cell cluster") +
theme(strip.background = element_blank()) +
theme(axis.text.y = element_text(color = "black")) +
ylab("") + xlab("dsb normalized protein")
pggsave(p, filename = paste0(figpath, "BCN_cohort_proteindistributions.pdf"), width = 3, height = 3.8)
# B cell state signature analysis
# extract signature geens
= readRDS(here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gsea1 = c("CD40_ACT",
mods "REACTOME_ACTIVATION_OF_BH3_ONLY_PROTEINS",
"LI.M160 leukocyte differentiation",
"REACTOME_INTRINSIC_PATHWAY_FOR_APOPTOSIS")
= readRDS('signature_curation/combined_sig_sub.rds')['CD40_ACT']
cd40
# Define apoptosis signature
$BC_Naive %>%
gsea1filter(pathway %in% mods) %$%
leadingEdge=
apoptosis.signature list('apoptosis.signature' =
$BC_Naive %>%
gsea1filter(pathway %in% mods[2:4]) %$% leadingEdge %>%
unlist(use.names = FALSE) %>%
unique())
= c(cd40, apoptosis.signature)
sig.test saveRDS(sig.test,file = paste0(datapath,'sig.test.rds'))
=readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/bsig_data/sig.test.rds'))
sig.test
::fwrite(list(sig.test$apoptosis.signature),file = paste0(datapath, 'bsig.apoptosis.txt'),sep = '\t')
data.table::fwrite(list(sig.test$CD40_ACT),file = paste0(datapath, 'bsig.cd40.txt'),sep = '\t')
data.table
# gsea
# load contrast fit results
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/fit12e.rds'))
fit12e = ExtractResult(
toprank model.fit.list = fit12e,
what = 'lmer.z.ranks',
coefficient.number = 1,
coef.name = 'delta'
)
= fgsea::fgsea(pathways = list('apoptosis.signature' = sig.test$apoptosis.signature), stats = toprank$BC_Naive)
gs.bsig $leadingEdge
gs.bsig
= fgsea::plotEnrichment(pathway = sig.test$apoptosis.signature,stats = toprank$BC_Naive) +
p geom_line(size = 2, color = 'deepskyblue3')
ggsave(p, filename = paste0(figpath, 'apoptosis.sig.gsea.citeseq.pdf'), width = 4, height = 3)
##################
# fit single cell model
# score modules
= WeightedCellModuleScore(gene_matrix = s@assays$RNA@data,
ms module_list = sig.test,
cellwise_scaling = FALSE,
return_weighted = FALSE)
# combine score and metadata
= cbind(s@meta.data, ms)
d = names(sig.test)[1];
index1 = names(sig.test)[length(sig.test)]
index2
# Calculate d1 FC of average module expression
= d %>%
ddf group_by(sample, sampleid, cohort, timepoint, celltype_joint) %>%
summarise_at(.vars = names(sig.test), .funs = mean) %>%
ungroup() %>%
gather(module, average, index1:index2) %>%
mutate(celltype_module = paste(celltype_joint, module, sep = "~")) %>%
arrange(celltype_joint, sampleid) %>%
mutate(fold_change = lead(average) - average)
= function(x){ (x - mean(x))/ sd(x)}
scale.simple =
signal_cor %>%
ddf filter(timepoint == "d0") %>%
filter(module %in% c( 'CD40_ACT', 'apoptosis.signature')) %>%
select(sample, cohort, module, fold_change) %>%
spread(module, fold_change)
$apoptosis.signature = scale.simple(signal_cor$apoptosis.signature)
signal_cor$CD40_ACT = scale.simple(signal_cor$CD40_ACT)
signal_cor
=
p ggplot(signal_cor %>% mutate(timepoint = str_sub(sample, -2, -1)),
aes(x = apoptosis.signature, y = CD40_ACT)) +
theme_bw() +
geom_smooth(method = "lm", color = col.alpha('black', 0.8)) +
xlab('B cell apoptosis signature fold change') +
ylab('CD40 Activation signature fold change') +
geom_point(aes(fill = cohort), size = 3, shape = 21, show.legend = FALSE) +
scale_fill_manual(values = c(col.alpha("grey", 0.8), col.alpha("deepskyblue3",0.8))) +
::stat_cor(method = "pearson", label.x.npc = 0.01, label.y.npc = 0.01) +
ggpubrggtitle("Naive B cells")
pggsave(p, filename = paste0(figpath, "CD40score_vs_apoptosissig.pdf"), width = 3.2, height = 3.2)
saveRDS(signal_cor, file = paste0(datapath, 'signalcor.rds'))
# Fit mixed model to apoptosis signature.
$cohort_timepoint = factor(d$cohort_timepoint, levels = c("H1N1_d0", "H1N1_d1", "H5N1_d0", "H5N1_d1"))
d$sex = factor(d$gender)
d= c(1,0,0,0);
c00 = c(0,1,0,0);
c01 = c(0,0,1,0);
c10 = c(0,0,0,1)
c11 = list("time1vs0_group2vs1" = ((c11 - c10) - (c01 - c00)), "time0_group2vs1" = (c10 - c00))
contrast_2 = 'apoptosis.signature ~ 0 + cohort_timepoint + age + sex + (1|sampleid)'
f1 = lme4::lmer(formula = f1, data = d)
m1 = emmeans(object = m1, specs = ~ cohort_timepoint, data = d, lmer.df = "asymptotic")
emm1 = emmeans::contrast(emm1, method = contrast_2)
contrast_fit = summary(contrast_fit,infer = c(TRUE, TRUE))
msummary1 $module = 'apoptosis.signature'
msummary1saveRDS(msummary1, file = paste0(datapath,"apoptosis_signature_singlecellmodel_result.rds"))
# visualize
# plotsingle cell distributionn and emmeans contrasts
= c("grey48", "grey", "grey48", "deepskyblue3")
cu = sapply(cu, col.alpha, alpha = 0.8) %>% unname()
cu.alpha
# set theme
= list(theme_bw(),
plot.aes theme(axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15)),
scale_color_manual('grey'))
= list(theme_bw(),
em_aes coord_flip(),
theme(axis.title.y = element_text(size = 7),
axis.text.y = element_text(size = 7)),
scale_color_manual('grey'))
# combined signature change emm in p1 and change y value in p0
= ggplot(d, aes(x = cohort_timepoint, y = apoptosis.signature, fill = cohort_timepoint )) +
p0 geom_violin(show.legend = F,trim = TRUE) +
+
plot.aes ylab('apoptosis signature') +
xlab('vaccine group ~ time') +
scale_fill_manual(values = cu.alpha) +
ggtitle('Naive B cells') +
theme(axis.title.x = element_text(size = 12))
ggsave(p0, filename = paste0(figpath, 'apoptosis.sig.cells.pdf'), width = 4, height = 3.5)
= plot(emm1) +
p1 +
em_aes theme(axis.text.x = element_blank())
ggsave(p1, filename = paste0(figpath, 'apoptosis.sig.emmeans.pdf'), width = 1.2, height =3 )
= plot(msummary1) +
p2 theme_bw() +
geom_vline(xintercept = 0, linetype = 'dashed') +
ggtitle(unique(msummary1$module))
ggsave(p2, filename = paste0(figpath, 'contrast.emmeans.pdf'), width = 4, height = 1.2)
Distribution of B cell apoptosis genes from all fitted mixed model z
statistics from fold change contrast within naive B cells.
mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/7_bgenes.r
# Save results table
suppressMessages(library(tidyverse))
suppressMessages(library(here))
#suppressMessages(library(scglmmr))
source('functions/scglmmr.functions.R')
source(here('functions/MattPMutils.r'))
= here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/bsig_data/")
datapath = here("mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/figuresV4/bsig/")
figpath
# read gsea and mixed model results
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/fit12e.rds'))
fit12e = ExtractResult(model.fit.list = fit12e, coefficient.number = 1, coef.name = 'delta')
res = readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc = c('PMAIP1 (NOXA)', 'BCL2', 'BTG2', 'BTG1')
apop $BC_Naive$gene[res$BC_Naive$gene == 'PMAIP1'] <- 'PMAIP1 (NOXA)'
res= readRDS(here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/bsig_data/sig.test.rds'))
sig.test = sig.test$apoptosis.signature
apop.genes =
p ggplot(res$BC_Naive, aes(x = logFC, y = z.std)) +
theme_bw() +
xlim(-3,2.1) +
geom_bin2d(bins = 400, show.legend = FALSE, fill = 'black') +
ylab('Mixed model contrast \n standardized z statistic') +
xlab('Difference in day 1 log fold change\nAS03 vs unadjuvanted') +
geom_point(data = res$BC_Naive %>% filter(gene %in% c(apop , apop.genes)), aes(x = logFC, y = z.std),
color = 'deepskyblue3', size = 2) +
::geom_text_repel(data = res$BC_Naive %>% filter(gene %in% apop),
ggrepelaes(x = logFC, y = z.std, label = gene),
color = 'red',size = 5,
nudge_x = -0.6, seed = 2,
nudge_y = 0, box.padding = 1,
segment.size = 0.1) +
theme(title = element_text(size = 18)) +
geom_vline(xintercept = 0, linetype ='dashed') +
ggtitle('Naive B cells')
pggsave(p,filename = paste0(figpath,'bcn_contrast_genes.pdf'), width = 5, height = 5)
Analysis of surface plasmon resonance data to correlate strain specific with non strain antibody avidity in AS03 adjuvanted donors. mid_res/ru/RU_binding.r
suppressMessages(library(tidyverse))
suppressMessages(library(here))
=here("mid_res/ru/figures/"); dir.create(figpath)
figpath =here("mid_res/ru/generated_data/"); dir.create(datapath, recursive = TRUE)
datapath
# define adjuvant subjects
= data.table::fread(here("data/CHI_H5N1_data/clinical_info_adj.txt"))
met
# define adjuvant subjects
= met %>%
adj.subjects select(`Subject ID`, Adjuvant) %>%
filter(Adjuvant == 'Adj') %>%
select(subjectid = `Subject ID`, Adjuvant)
# load SPR data
= read_delim(file = "data/CHI_H5N1_data/MN_abbinding/MPMEDIT_CHI_H5N1_AS03_SPR_data_2017_Khurana_SK.txt",delim = '\t')
ru = c("H5N1-011", "H5N1-017", "H5N1-021", "H5N1-031", "H5N1-038", "H5N1-043")
CITE =
ru.sub %>%
ru separate(Sera,into = c('sx','time'),sep = '-') %>%
filter(time == ' D42') %>%
filter(subjectid %in% adj.subjects$subjectid) %>%
mutate(cite = ifelse(subjectid %in% CITE, '1', '0')) %>%
saveRDS(ru.sub,file = paste0(datapath,'ru.sub.rds'))
=
p ggplot(data = ru.sub, aes(x = Indo_RU_HA1, y = Viet_RU_HA1 )) +
theme_bw() +
geom_smooth(method = "lm", color = "black") +
geom_point(shape = 21 , size = 3, fill = "deepskyblue3") +
theme(legend.position = "top") +
theme(axis.title = element_text(size = 12)) +
theme(axis.text = element_text(size = 10, color = 'black')) +
# ggrepel::geom_text_repel(data = ru.sub %>% filter( cite == 1), size = 2.8, aes(label = subjectid)) +
xlab("Day 42 Antibody Binding (RU) \n Heterologous H5N1 strain (Vietnam)") +
ylab("Day 42 Antibody Binding (RU) \n vaccine H5N1 strain (Indonesia)") +
::stat_cor(method = "pearson")
ggpubrggsave(p, filename = paste0(figpath,"RU_plot.pdf" ), width = 3.5, height = 3.5)
Fit mixed model with unadjuvanted and AS03 adjuvanted subjects, apply
contrast to define difference in 24h fold change post vaccination on
FACS sorted subsets from validation cohort. Format data:
mid_res/vand/1_V4_vand_d1_adjvsnon_DATAFORMAT.r
# R version 4.0.5
# H5 vs H1 day 1 DE contrast pseudobulk model
# reanalysis of Howard et. al AS03 vs PBS cotrol with sorted lineages rnaseq data
# data from https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0167488#pone.0167488.s009
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
###### save paths
= here("mid_res/vand/generated_data/"); dir.create(datapath)
datapath
# load log CPM data from supp table 2 list of celltypes
= here("data/vand/data/")
vand_datapath = list.files(path = vand_datapath, full.names = T)
ctd = ctd[-1]
ctd = lapply(ctd, function(x){read_delim(x, delim = ",")})
e
# get celltypes to name list
= lapply(e,function(x){ str_sub(colnames(x), 3,5) %>% unique}) %>% unlist
celltypes = setdiff(celltypes, "SEM")
celltypes names(e) = celltypes
# setup data for limma / dream lme4
= lapply(e, function(x){
e = x %>%
x select(-ENSEMBL63_GENE_ID) %>%
select(matches('D000|D001|ENSEMBL63_GENE_NAME')) %>%
column_to_rownames("ENSEMBL63_GENE_NAME")
})
# create metadata
= lapply(e, function(x){
md colnames(x) %>%
as.data.frame() %>%
mutate(group = str_sub(., -4,-1)) %>%
mutate(group = if_else(group == "_PBS", "xPBS", group )) %>%
mutate(subjectid = str_sub(., 1,1)) %>%
mutate(timepoint = str_sub(., 7,10)) %>%
mutate(timepoint = ifelse(timepoint == 'D000', 'd0', 'd1')) %>%
mutate(time.group = paste(timepoint, group,sep = "_")) %>%
column_to_rownames(".")
})
# rm samples with missing data.
lapply(md, function(x) { x$subjectid})
###
# md 1 = P
# md 5 = P
# md 6 = C J
## remove missing data from sample data
1]] = md[[1]] %>%
md[[rownames_to_column("sample") %>%
filter(!subjectid %in% "P") %>%
column_to_rownames("sample")
5]] = md[[5]] %>%
md[[rownames_to_column("sample") %>%
filter(!subjectid %in% "P") %>%
column_to_rownames("sample")
6]] = md[[6]] %>%
md[[rownames_to_column("sample") %>%
filter(!subjectid %in% c("C", "J")) %>%
column_to_rownames("sample")
# remove missing data from RNAseq data
1]] = e[[1]][ ,rownames(md[[1]])]
e[[5]] = e[[5]][ ,rownames(md[[5]])]
e[[6]] = e[[6]][ ,rownames(md[[6]])]
e[[
#############################################
## create model matrix and check model rank
= lapply(md, function(x){
d1m = x %>% mutate_if(is.character, as.factor) %$% time.group ;
x = model.matrix(~0 + x) ;
x colnames(x) = str_sub(colnames(x), start = -9, end = -1)
return(x)
})
# re QC model
for (i in 1:length(d1m)) {
= d1m[[i]] ; print(i)
model stopifnot(Matrix::rankMatrix(model) == ncol(model)) ; stopifnot(any(colSums(model) == 0) == FALSE)
}stopifnot(all.equal(
lapply(md, rownames), lapply(e,colnames)
))
# Confirm data has been normalized:
# lapply(e, boxplot)
saveRDS(md, file = paste0(datapath, 'md.rds'))
saveRDS(e, file = paste0(datapath, 'e.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Mojave 10.14.6
#
# Matrix products: default
# BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] magrittr_2.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.6
# [9] ggplot2_3.3.3 tidyverse_1.3.0 here_1.0.1
#
# loaded via a namespace (and not attached):
# [1] Rcpp_1.0.6 cellranger_1.1.0 pillar_1.4.7 compiler_4.0.5 dbplyr_2.1.0 tools_4.0.5 lattice_0.20-41
# [8] jsonlite_1.7.2 lubridate_1.7.9.2 lifecycle_1.0.0 gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.10 Matrix_1.3-2
# [15] reprex_1.0.0 cli_2.5.0 rstudioapi_0.13 DBI_1.1.1 haven_2.3.1 withr_2.4.1 xml2_1.3.2
# [22] httr_1.4.2 fs_1.5.0 generics_0.1.0 vctrs_0.3.6 hms_1.0.0 rprojroot_2.0.2 grid_4.0.5
# [29] tidyselect_1.1.0 glue_1.4.2 R6_2.5.0 readxl_1.3.1 modelr_0.1.8 backports_1.2.1 scales_1.1.1
# [36] ellipsis_0.3.1 rvest_0.3.6 assertthat_0.2.1 colorspace_2.0-0 stringi_1.5.3 munsell_0.5.0 broom_0.7.5
# [43] crayon_1.4.1
Fit model:
mid_res/vand/2_V4_vand_d1_adjvsnon_MODEL.r
# R version 4.0.5
# vanderilt validation cohort of contrast model results from CITE-seq
# analysis within analagous main immune cell lineage.
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(variancePartition))
suppressMessages(library(scglmmr))
###### save paths
= here("mid_res/vand/generated_data/")
datapath
# parallel options
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# load data
= readRDS(file = here('mid_res/vand/generated_data/e.rds'))
e = readRDS(file = here('mid_res/vand/generated_data/md.rds'))
md
# process metadata for contrast model
# order time.group factor
= lapply(md, function(x){
md_lmer = x %>% mutate(time.group = factor(time.group,
x levels = c("d0_AS03", "d1_AS03", "d0_xPBS", "d1_xPBS")))
})
# specify formula for lme4 / dream and set up contrasts
<- ~ 0 + time.group + (1|subjectid)
f1
# specify contrast matrix to test the fold change difference
# based on levels of time.group this should be cmat = c(-1, 1, 1, -1)
= makeContrastsDream(
L2 formula = f1,
data = md_lmer[[1]],
contrasts = c(delta = "(time.groupd1_AS03 - time.groupd0_AS03) - (time.groupd1_xPBS - time.groupd0_xPBS)")
)plotContrasts(L2) + ggsave(filename = paste0(figpath,'contrastmodel.pdf'), width = 7, height = 4)
# fit model on each subset
= fitne = list()
fit1 for (i in 1:length(e)) {
# init data
= e[[i]]
norm_dat = md_lmer[[i]]
meta = f1
form = L2
contrast_matrix
# fit contrast mixed model on prenormalized values
= dream(exprObj = norm_dat,
fitmm formula = form,
data = meta,
L = contrast_matrix,
BPPARAM = pparam,
useWeights = FALSE,
REML = TRUE)
# save results
= fitmm
fitne[[i]] = variancePartition::eBayes(fitmm)
fit1[[i]]
}names(fit1) = names(fitne) = names(e)
# Save
# day 1 contrast fit
saveRDS(object = fit1, file = paste0(datapath, 'fit1.rds'))
saveRDS(object = fitne, file = paste0(datapath, 'fit1.rds'))
# model fitting data
saveRDS(object = md_lmer, file = paste0(datapath, 'md_lmer.rds'))
saveRDS(object = L2, file = paste0(datapath, 'L2.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Mojave 10.14.6
#
# Matrix products: default
# BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] scglmmr_0.1.0 variancePartition_1.25.6 BiocParallel_1.24.1 limma_3.46.0 magrittr_2.0.1
# [6] here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [11] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] tidyselect_1.1.0 lme4_1.1-26 RSQLite_2.2.7 AnnotationDbi_1.52.0
# [5] grid_4.0.5 scatterpie_0.1.7 munsell_0.5.0 codetools_0.2-18
# [9] statmod_1.4.35 withr_2.4.1 colorspace_2.0-0 GOSemSim_2.16.1
# [13] Biobase_2.50.0 rstudioapi_0.13 stats4_4.0.5 ggsignif_0.6.0
# [17] DOSE_3.16.0 labeling_0.4.2 MatrixGenerics_1.2.1 Rdpack_2.1.1
# [21] emmeans_1.5.4 GenomeInfoDbData_1.2.4 polyclip_1.10-0 pheatmap_1.0.12
# [25] bit64_4.0.5 farver_2.0.3 rprojroot_2.0.2 downloader_0.4
# [29] coda_0.19-4 vctrs_0.3.6 generics_0.1.0 TH.data_1.0-10
# [33] R6_2.5.0 doParallel_1.0.16 GenomeInfoDb_1.26.7 graphlayouts_0.7.2
# [37] locfit_1.5-9.4 bitops_1.0-6 cachem_1.0.4 fgsea_1.16.0
# [41] DelayedArray_0.16.3 assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16
# [45] ggraph_2.0.5 enrichplot_1.10.2 gtable_0.3.0 egg_0.4.5
# [49] tidygraph_1.2.0 sandwich_3.0-0 rlang_0.4.10 splines_4.0.5
# [53] rstatix_0.7.0 broom_0.7.5 BiocManager_1.30.10 reshape2_1.4.4
# [57] abind_1.4-5 modelr_0.1.8 backports_1.2.1 qvalue_2.22.0
# [61] clusterProfiler_3.18.1 tools_4.0.5 ellipsis_0.3.1 gplots_3.1.1
# [65] RColorBrewer_1.1-2 BiocGenerics_0.36.1 Rcpp_1.0.6 plyr_1.8.6
# [69] progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3 prettyunits_1.1.1
# [73] ggpubr_0.4.0 viridis_0.5.1 cowplot_1.1.1 S4Vectors_0.28.1
# [77] zoo_1.8-8 SummarizedExperiment_1.20.0 haven_2.3.1 ggrepel_0.9.1
# [81] fs_1.5.0 data.table_1.14.0 lmerTest_3.1-3 DO.db_2.9
# [85] openxlsx_4.2.3 reprex_1.0.0 mvtnorm_1.1-1 matrixStats_0.58.0
# [89] hms_1.0.0 GSVA_1.38.2 xtable_1.8-4 pbkrtest_0.5-0.1
# [93] RhpcBLASctl_0.21-247.1 XML_3.99-0.6 rio_0.5.16 readxl_1.3.1
# [97] IRanges_2.24.1 gridExtra_2.3 compiler_4.0.5 KernSmooth_2.23-18
# [101] crayon_1.4.1 shadowtext_0.0.9 minqa_1.2.4 ggfun_0.0.4
# [105] snow_0.4-3 lubridate_1.7.9.2 DBI_1.1.1 tweenr_1.0.2
# [109] dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27 Matrix_1.3-2
# [113] car_3.0-10 cli_2.5.0 rbibutils_2.0 parallel_4.0.5
# [117] igraph_1.2.6 GenomicRanges_1.42.0 pkgconfig_2.0.3 rvcheck_0.1.8
# [121] numDeriv_2016.8-1.1 foreign_0.8-81 xml2_1.3.2 foreach_1.5.1
# [125] annotate_1.68.0 XVector_0.30.0 estimability_1.3 rvest_0.3.6
# [129] digest_0.6.27 graph_1.68.0 cellranger_1.1.0 fastmatch_1.1-0
# [133] edgeR_3.32.1 GSEABase_1.52.1 curl_4.3 gtools_3.8.2
# [137] nloptr_1.2.2.2 lifecycle_1.0.0 nlme_3.1-152 jsonlite_1.7.2
# [141] aod_1.3.1 carData_3.0-4 viridisLite_0.3.0 pillar_1.4.7
# [145] lattice_0.20-41 fastmap_1.1.0 httr_1.4.2 survival_3.2-10
# [149] GO.db_3.12.1 glue_1.4.2 zip_2.1.1 iterators_1.0.13
# [153] bit_4.0.4 ggforce_0.3.3 stringi_1.5.3 blob_1.2.1
# [157] org.Hs.eg.db_3.12.0 caTools_1.18.1 memoise_2.0.0
Test enrichment of CITE-seq derived AS03 specific cell phenotypes in
the validation cohort.
mid_res/vand/3_V4_vand_enrCITE_sgnals.r
# gene set enrichment signals in vand cohort
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(variancePartition))
#suppressMessages(library(scglmmr))
source(here('functions/scglmmr.functions.R'))
###### save paths
= here("mid_res/vand/generated_data/")
datapath
# parallel opts
register(SnowParam(4))
= SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# load signatures
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/li.rds'))
li
# combine dsignatures
= li$CD14_Mono %>% unlist() %>% unique()
mono.combined = li$mDC %>% unlist() %>% unique()
mdc.combined = li$BC_Naive %>% unlist() %>% unique()
nb.combined
# combine T signals for total T cell sort validaiton
= list(
t.combined 'Tcell.combined' = c(
$CD4_CD25_Tcell,
li$CD4_Efct_Mem_Tcell,
li$CD4Naive_Tcell,
li$CD8_Mem_Tcell,
li$CD8_Naive_Tcell
li%>%
) unlist() %>%
unique()
)
# add additional b cell signatures from apoptosis hypothesis
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/bsig_data/sig.test.rds'))
sig.test
# add combined signals
$CD14_Mono$combined.signature = mono.combined
li$mDC$combined.signature = mdc.combined
li$BC_Naive$combined.signature = nb.combined
li$BC_Naive = c(li$BC_Naive, sig.test)
li
# load vand fits and extract ranks
= readRDS(file = here('mid_res/vand/generated_data/fit1.rds'))
fit1 = ExtractResult(model.fit.list = fit1,
vand.rank what = 'lmer.z.ranks',
coefficient.number = 1,
coef.name = 'delta')
# CD14 monocyte test in total sorted monocyte
= FgseaList(
mv rank.list.celltype = list('MNC' = vand.rank$MNC),
pathways = li$CD14_Mono,
BPPARAM = pparam
)
# mDC test in sorted DC
= FgseaList(
dcv rank.list.celltype = list('DNC' = vand.rank$DNC),
pathways = li$mDC,
BPPARAM = pparam
)
# naive BC test in sorted total B
= FgseaList(
bcv rank.list.celltype = list('BCL' = vand.rank$BCL),
pathways = li$BC_Naive,
BPPARAM = pparam
)
# T cell combined in sorted T cell
= FgseaList(
tcv rank.list.celltype = list('TCL' = vand.rank$TCL),
pathways = c(
$CD4_CD25_Tcell,
li$CD4_Efct_Mem_Tcell,
li$CD4Naive_Tcell,
li$CD8_Mem_Tcell,
li$CD8_Naive_Tcell,
li
t.combined
),BPPARAM = pparam
)
# save objects
saveRDS(object = mv,file = paste0(datapath, 'mv.rds'))
saveRDS(object = dcv,file = paste0(datapath, 'dcv.rds'))
saveRDS(object = bcv,file = paste0(datapath, 'bcv.rds'))
saveRDS(object = tcv,file = paste0(datapath, 'tcv.rds'))
Combined figures for the CITE-seq and validation cohort.
mid_res/combined_contrast/1_combined_contrast_vand_citeseq.r
suppressMessages(library(tidyverse))
suppressMessages(library(here))
source(here('functions/scglmmr.functions.R'))
source(here('functions/MattPMutils.r'))
# output directories
= here("mid_res/combined_contrast/figures/")
figpath = here('mid_res/combined_contrast/generated_data/')
datapath dir.create(figpath); dir.create(datapath)
# load CITE results
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc
#######
# CD14 monocytes cite-seq
######
= gc$CD14_Mono %>%
mo as.data.frame() %>%
filter(padj < 0.05 & NES > 0)
# shorten names
$pathway[mo$pathway == "LI.M4.3 myeloid cell enriched receptors and transporters" ] <- 'LI.M4.3 myeloid receptors and transporters'
mo$pathway[mo$pathway == "REACTOME_INTERFERON_SIGNALING" ] <- 'reactome interferon signaling'
mo$pathway[mo$pathway == "LI.M37.0 immune activation - generic cluster" ] <- 'LI.M37.0 immune activation'
mo$pathway[mo$pathway == "SLE_SIG" ] <- 'IFN Sig (SLE)'
mo$pathway[mo$pathway == "IFN1_DCACT" ] <- 'IFN I DCACT'
mo
# save adjuvant signatures without IFN sigs
= mo %>% filter(! pathway %in% c('IFN I DCACT', 'IFN Sig (SLE)', 'reactome interferon signaling', "LI.M127 type I interferon response"))
mo.noifn saveRDS(mo.noifn,file = paste0(datapath, 'mo.noifn.rds'))
$pathway
mo.noifn# [1] "LI.M11.0 enriched in monocytes (II)" "LI.M16 TLR and inflammatory signaling"
# [3] "LI.S4 Monocyte surface signature" "LI.M4.0 cell cycle and transcription"
# [5] "LI.M37.0 immune activation" "LI.M4.3 myeloid receptors and transporters"
# [7] "LI.M118.1 enriched in monocytes (surface)" "LI.M169 mitosis (TF motif CCAATNNSNNNGCG)"
# [9] "LI.M67 activated dendritic cells" "LI.M194 TBA"
# [11] "LI.M118.0 enriched in monocytes (IV)"
######
# mDCs cite-seq
######
= gc$mDC %>%
dc as.data.frame() %>%
filter(padj<0.05 & NES > 0)
# shorten names
$pathway[dc$pathway == "REACTOME_MITOTIC_M_M_G1_PHASES" ] <- 'reactome mitosis M G1 phase'
dc$pathway[dc$pathway == "REACTOME_CELL_CYCLE_CHECKPOINTS" ] <- 'reactome cell cycle checkpoints'
dc$pathway[dc$pathway == "REACTOME_APC_C_CDH1_MEDIATED_DEGRADATION_OF_CDC20_AND_OTHER_APC_C_CDH1_TARGETED_PROTEINS_IN_LATE_MITOSIS_EARLY_G1" ] <- 'reactome CDC20 APC C degradation late mitosis early G1'
dc$pathway[dc$pathway == "LI.M4.3 myeloid cell enriched receptors and transporters" ] <- 'LI.M4.3 myeloid receptors and transporters'
dc$pathway[dc$pathway == "REACTOME_VIF_MEDIATED_DEGRADATION_OF_APOBEC3G" ] <- 'reactome VIF APOBEC3G'
dc$pathway[dc$pathway == "REACTOME_DNA_REPLICATION" ] <- 'reactome DNA replication'
dc$pathway[dc$pathway == "REACTOME_CROSS_PRESENTATION_OF_SOLUBLE_EXOGENOUS_ANTIGENS_ENDOSOMES" ] <- 'reactome cross presentation of exogenous antigen'
dc$pathway[dc$pathway == "REACTOME_AUTODEGRADATION_OF_CDH1_BY_CDH1_APC_C" ] <- 'reactome autodegradation of CDH1 by APC C'
dc$pathway[dc$pathway == "REACTOME_MITOTIC_M_M_G1_PHASES" ] <- 'reactome mitosis M G1 phase'
dc$pathway[dc$pathway == "REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS" ] <- 'reactome APC C CDC20 degradation of mitotic proteins'
dc$pathway[dc$pathway == "REACTOME_M_G1_TRANSITION" ] <- 'reactome M G1 transition'
dc$pathway[dc$pathway == "REACTOME_EXTRINSIC_PATHWAY_FOR_APOPTOSIS" ] <- 'reactome apoptotic extrinsic pathway'
dc
# IFN these pathways are not present in the dc enrichments.
= dc
dc.noifn saveRDS(dc.noifn,file = paste0(datapath, 'dc.noifn.rds'))
$pathway
dc.noifn# [1] "LI.M4.0 cell cycle and transcription" "LI.M11.0 enriched in monocytes (II)"
# [3] "reactome mitosis M G1 phase" "reactome cell cycle checkpoints"
# [5] "reactome mitosis late mitosis early G1" "LI.M4.3 myeloid receptors and transporters"
# [7] "reactome VIF APOBEC3G" "reactome DNA replication"
# [9] "LI.M194 TBA" "reactome cross presentation of exogenous antigen"
# [11] "reactome autodegradation of CDH1 by APC C" "reactome APC C CDC20 degradation of mitotic proteins"
# [13] "LI.M177.0 TBA" "reactome M G1 transition"
# [15] "reactome apoptotic extrinsic pathway"
#####
# B cells cite-seq
#####
= gc$BC_Naive %>%
bn as.data.frame() %>%
filter(padj < 0.05) # do not apply NES filter
# add string for cohort
$cohort = 'CITE-seq'
mo$cohort = 'CITE-seq'
dc$cohort = 'CITE-seq'
bn
##################################
# load validation cohort data
##################################
= readRDS(file = here('mid_res/vand/generated_data/mv.rds'))
mv = readRDS(file = here('mid_res/vand/generated_data/dcv.rds'))
dcv = readRDS(file = here('mid_res/vand/generated_data/bcv.rds'))
bcv = as.data.frame(mv$MNC)
mv = as.data.frame(dcv$DNC)
dcv = as.data.frame(bcv$BCL)
bcv
# shorten names
$pathway[mv$pathway == "LI.M4.3 myeloid cell enriched receptors and transporters" ] <- 'LI.M4.3 myeloid receptors and transporters'
mv$pathway[mv$pathway == "LI.M37.0 immune activation - generic cluster" ] <- 'LI.M37.0 immune activation'
mv$pathway[mv$pathway == "SLE_SIG" ] <- 'IFN Sig (SLE)'
mv$pathway[mv$pathway == "IFN1_DCACT" ] <- 'IFN I DCACT'
mv$pathway[dcv$pathway == "REACTOME_MITOTIC_M_M_G1_PHASES" ] <- 'reactome mitosis M G1 phase'
dcv$pathway[dcv$pathway == "REACTOME_CELL_CYCLE_CHECKPOINTS" ] <- 'reactome cell cycle checkpoints'
dcv$pathway[dcv$pathway == "REACTOME_APC_C_CDH1_MEDIATED_DEGRADATION_OF_CDC20_AND_OTHER_APC_C_CDH1_TARGETED_PROTEINS_IN_LATE_MITOSIS_EARLY_G1" ] <- 'reactome CDC20 APC C degradation late mitosis early G1'
dcv$pathway[dcv$pathway == "LI.M4.3 myeloid cell enriched receptors and transporters" ] <- 'LI.M4.3 myeloid receptors and transporters'
dcv$pathway[dcv$pathway == "REACTOME_VIF_MEDIATED_DEGRADATION_OF_APOBEC3G" ] <- 'reactome VIF APOBEC3G'
dcv$pathway[dcv$pathway == "REACTOME_DNA_REPLICATION" ] <- 'reactome DNA replication'
dcv$pathway[dcv$pathway == "REACTOME_CROSS_PRESENTATION_OF_SOLUBLE_EXOGENOUS_ANTIGENS_ENDOSOMES" ] <- 'reactome cross presentation of exogenous antigen'
dcv$pathway[dcv$pathway == "REACTOME_AUTODEGRADATION_OF_CDH1_BY_CDH1_APC_C" ] <- 'reactome autodegradation of CDH1 by APC C'
dcv$pathway[dcv$pathway == "REACTOME_MITOTIC_M_M_G1_PHASES" ] <- 'reactome mitosis M G1 phase'
dcv$pathway[dcv$pathway == "REACTOME_APC_C_CDC20_MEDIATED_DEGRADATION_OF_MITOTIC_PROTEINS" ] <- 'reactome APC C CDC20 degradation of mitotic proteins'
dcv$pathway[dcv$pathway == "REACTOME_M_G1_TRANSITION" ] <- 'reactome M G1 transition'
dcv$pathway[dcv$pathway == "REACTOME_EXTRINSIC_PATHWAY_FOR_APOPTOSIS" ] <- 'reactome apoptotic extrinsic pathway'
dcv
# append with cohort
$cohort = 'validation'
dcv$celltype = 'sorted DC'
dcv
$cohort = 'validation'
bcv$celltype = 'sorted B cells'
bcv
$cohort = 'validation'
mv$celltype = 'sorted monocytes'
mv
# pathways in CITE hyp set.
= dcv %>% filter(pathway %in% dc$pathway)
dcv = mv %>% filter(pathway %in% mo$pathway)
mv
# combine
= c('pathway', 'pval', 'padj', 'NES', 'celltype', 'cohort')
col.keep = list(dcv, bcv, mv, mo, dc, bn)
r.list = lapply(r.list, function(x) x %>% select(all_of(col.keep)))
r.list = bind_rows(r.list)
d
# group
$main = ifelse(d$celltype %in% c('mDC', 'sorted DC'), yes = 'DC', no = d$celltype)
d$main = ifelse(d$celltype %in% c('CD14_Mono', 'sorted monocytes'), yes = 'Mono', no = d$main)
d$main = ifelse(d$celltype %in% c('BC_Naive', 'sorted B cells'), yes = 'BC', no = d$main)
d
# unnate subset
= d %>% filter(!celltype %in% c('BC_Naive', 'sorted B cells'))
d2 $main = factor(d2$main, levels = c('Mono', 'DC'))
d2
# add asterisk for significant validation
= d2 %>% filter(!pathway == 'combined.signature')
d2 =
d3 %>%
d2 mutate(padj.validation = ifelse(cohort == 'validation', padj, no = Inf)) %>%
mutate(padj.citeseq = ifelse(cohort == 'CITE-seq', padj,no = Inf)) %>%
mutate(pathway.new = ifelse( padj.validation < 0.01, yes = paste0(' * ', pathway), no = pathway))
%>% filter(cohort == 'validation')
d3 $pathway = plyr::mapvalues(d2$pathway,from = d3$pathway,to = d3$pathway.new)
d2
$cohort = factor(d2$cohort, levels = c("validation", "CITE-seq"))
d2=
p ggplot(d2, aes(x = NES, y = reorder(pathway, NES), size = -log10(padj), fill=cohort)) +
theme_bw() +
geom_jitter(stroke = 0.3, shape = 21, height = 0.1, width = 0) + # visualize directly overlapping points
scale_fill_manual(values = c(col.alpha('#90C983',0.7), col.alpha('deepskyblue3', 0.5))) +
ylab("") +
xlab('Normalized Enrichment Score') +
geom_vline(xintercept = 0, linetype = 'dashed') +
facet_grid(vars(main), scales = 'free', space = 'free') +
theme_bw(base_size = 9) +
theme(axis.text = element_text(color = 'black')) +
guides(fill = guide_legend(override.aes = list(size = 5)))
pggsave(p, filename = paste0(figpath,'combined_as03_model_withifn.pdf'), width = 5.5, height = 3.8)
# B cells
= d %>%
d3 filter(celltype %in% c('BC_Naive', 'sorted B cells')) %>%
filter(pathway %in% c(bn$pathway, 'apoptosis.signature', 'CD40_ACT') )
=
p ggplot(d3 %>%filter(!pathway == 'combined.signature') %>%filter(cohort == 'validation'),
aes(x = NES,y = reorder(pathway, NES),size = -log10(padj),fill = cohort)) +
theme_bw() +
geom_point(shape = 21) +
scale_fill_manual(values = col.alpha('#90C983',0.7)) +
ylab("") +
xlab('Normalized Enrichment Score') +
geom_vline(xintercept = 0, linetype = 'dashed') +
theme_bw(base_size = 12) +
theme(axis.text = element_text(color = 'black')) +
guides(fill = guide_legend(override.aes = list(size = 5))) +
theme(legend.key.height = unit(0.3, units = 'cm'))
pggsave(p, filename = paste0(figpath,'validation_bc_as03_model.pdf'), width = 5, height = 2.3)
Use model output from 1_h1_mixed_effect_workflow_V4.r to rank genes
based moderated t test statistics of high vs low responder effect at
baseline adjusted for age sex & batch.
mid_res/baseline_response/1_baseline_gseaV3.r
# R version 4.0.5
# H1N1 Unadjuvanted group baseline gene set enrichment analysis
# gsea based on the ranks of the contrast high vs low responder pre vaccinaiton
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
# set save paths
= here("mid_res/baseline_response/dataV3/"); dir.create(datapath)
datapath = here("mid_res/baseline_response/figuresV3/"); dir.create(figpath)
figpath
# parallel options for FseaList
::register(BiocParallel::SnowParam(4))
BiocParallel= BiocParallel::SnowParam(workers = 4, type = "SOCK", progressbar = TRUE)
pparam
# load combined pathways
= readRDS(file = here('signature_curation/combined_signatures.rds'))
mods
# load baseline contrast, rank genes run gsea
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/cont0.rds'))
cont0 = ExtractResult(model.fit.list = cont0, what = 'gene.t.ranks',coefficient.number = 1, coef.name = 'adjmfc')
r0
# run fgea on each cell type
= FgseaList(rank.list.celltype = r0, pathways = mods, BPPARAM = pparam)
g0 saveRDS(object = g0, file = paste0(datapath, 'g0.rds'))
These results read in a set of shortened module / pathway names for visualization that is in the starting data folder Curate enrichment results pt 1. mid_res/baseline_response/2_curate_gseaV3.r
# R version 4.0.5
# Curate H1N1 Unadjuvanted group baseline gene set enrichment analysis
# calculate pairwise jaccard index and reduce enrichments to major signals with low mutual information
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
# read baseline enrichemnt results
= readRDS(file = here('signature_curation/module_rmlist.rds'))
mrm = readRDS(file = here('mid_res/baseline_response/dataV3/g0.rds'))
g0 = lapply(g0, function(x) x %>% filter(!pathway %in% mrm))
g0 = lapply(g0, function(x) x %>% filter(padj < 0.05))
filtered_g0
# compute jaccard index of leadingedge genes within celltype
= LeadingEdgeIndexed(gsea.result.list = filtered_g0, padj.threshold = 0.05)
li
= EnrichmentJaccard(gsealist = filtered_g0, indexedgenes = li,
jres saveplot = FALSE,
figpath = figpath,
returnJaccardMtx = TRUE,
fontsize_row = 7.5, fontsize_col = 7.5)
= jres$sortedgsea %>%
d mutate(leadingEdge = map_chr(leadingEdge, toString)) %>%
select(celltype, av_jaccard,everything())
write_delim(d,file = paste0(datapath, 'g0jaccard.csv'),delim = ',')
# save the jaccard matrices
= jres$jaccard_matrix_list
jmats saveRDS(jmats ,file = paste0(datapath, 'jmats.rds'))
sessionInfo()
Curate enrichment pt 2
mid_res/baseline_response/3_gsea.vis.r
# R version 4.0.5
# visualization of gene set enrichment results.
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
# res text gsea curated
= data.table::fread(here('mid_res/baseline_response/dataV3/g0jaccard.curated.txt')) %>%
d filter(include ==1) %>%
mutate(signal = paste(celltype, pathway, sep = '~'))
# gsea res raw
= readRDS(file = here('mid_res/baseline_response/dataV3/g0.rds'))
g0
# filter to the gene sets from curated results.
= list()
g0.sub for (i in 1:length(g0)) {
=
g0.sub[[i]] %>%
g0[[i]] mutate(signal = paste(celltype, pathway, sep = '~')) %>%
filter(signal %in% d$signal) %>%
mutate(celltype = str_replace_all(celltype,pattern = '_',replacement = ' '))
}names(g0.sub) = names(g0)
# plot
= PlotFgsea(gsea_result_list = g0.sub, padj_filter = 0.01)
p ggsave(p,filename = paste0(figpath, 'gsea.g0sub.baseline.pdf'), width = 9.5, height = 6)
# save object test curated fgsea formatted results.
saveRDS(g0.sub, file = paste0(datapath, 'g0.sub.rds'))
Within each subset calculate log cpm of cell type specific leading
edge enrichment phenotypes based on high vs low responder model.
mid_res/baseline_response/4_baseline_exprs_amz_score.r
# create baseline expression leading edge module correlation matrix
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
# read pb data, subset to day 0 non adj, subset out day 0 metadata.
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb = lapply(pb, function(x) x = x[ ,1:40])
pb = gsub("~.*","",colnames(pb[[1]]))
cnames = lapply(pb, function(x){
pb %>% as.data.frame() %>% setNames(nm = cnames) %>% as.matrix()
x
})= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV3/d0.rds'))
d0 = lapply(pb, function(x){ x = x[ , rownames(d0)]})
d0d
# convert pb data to log counts per million
= list()
d.norm for (i in 1:length(d0d)) {
= edgeR::DGEList(counts = d0d[[i]], samples = d0)
d = edgeR::filterByExpr(y = d$counts, min.count = 3,
gtable design = as.factor(d$samples$group))
= d[gtable, keep.lib.sizes=FALSE]
d = edgeR::cpm(y = d, log = TRUE, prior.count = 1)
d.norm[[i]]
}names(d.norm) = names(d0d)
# get leading edge genes from cur. baseline mods
= readRDS(file = here('mid_res/baseline_response/dataV3/g0.sub.rds'))
g0.sub = LeadingEdgeIndexed(gsea.result.list = g0.sub, padj.threshold = 0.05)
li.g0 = base::Filter(length, li.g0)
li.g0
# subset normalized expression to subsets with baseline enrichments
= d.norm[names(li.g0)]
d.norm
= list()
res for (i in 1:length(d.norm)) {
stopifnot(all.equal( names(d.norm[i]), names(li.g0[i]) ))
= scglmmr::calc_avg_module_zscore(
zscore module.list = li.g0[[i]], average.data.frame = d.norm[[i]]
)rownames(zscore) = paste(rownames(zscore), names(d.norm[i]), sep = '~')
= zscore
res[[i]]
}
= do.call(rbind, res) %>% t()
ds saveRDS(ds, file = paste0(datapath, 'ds.rds'))
sessionInfo()
For each subject calculate the day 7 fold change of the predictive antibody response signature (array data). mid_res/baseline_response/4b_d7FC_response_sig.r
# baseline nettwork correlation
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
# Day 7 array data and modules in day 7 data # load day 7 gene sets
= readRDS("signature_curation/core_d7.rds")
sig7 = list("CHI_Day7_Response" = sig7$`CHI d7 Response`)
module.list
= here("mid_res/baseline_response/dataV3/")
datapath
# read CHI array data
= data.table::fread(here('data/full_metadata/full_sample_metadata.txt')) %>%
subjects filter(CITEdata == 1 & vaccine_cohort == 'H1N1') %$%
subjectid
=
array7 ::fread(here("data/CHI_H1N1_data/microarray/CHI_GE_matrix_gene.txt"), data.table = F) %>%
data.table::remove_rownames() %>%
tibble::column_to_rownames("gene") %>%
tibbleselect(which(substr(names(.),1,3) %in% subjects)) %>%
select(., matches("day0|day1|day7")) %>%
select(-matches(c("day70|pre|day1") )) %>%
select(-c('207_day0', '209_day0', '279_day0'))
# calculate microarray fold changge (data is already in log space)
= array7[ ,seq(from=1, to = ncol(array7), by = 2)]
t0 = array7[ ,seq(from=2, to = ncol(array7), by = 2)]
t1 stopifnot(str_sub(colnames(t1), 1,3) == str_sub(colnames(t0), 1,3))
stopifnot(dim(t0) == dim(t1))
= t1 - t0
fc7 = as.data.frame(fc7)
fc7
# calculate the average z score of the day 7 fold change values across samples
= calc_avg_module_zscore(module.list = module.list, average.data.frame = fc7)
d7res names(d7res) = str_sub(names(d7res), 1,3)
# save results
saveRDS(d7res, file = paste0(datapath, 'd7res.rds'))
sessionInfo()
Correlate expression of cell type specific baseline states with the
day7 response signature.
mid_res/baseline_response/4c_d7sigFC_vs_baseline_correlation.r
# correlate module expression avz with d7 FC in antibody predictive signature.
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
source(here('functions/MattPMutils.r'))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/d7cor/");
figpath dir.create(figpath, recursive = TRUE)
# day 7 response signature fc
= readRDS(file = here('mid_res/baseline_response/dataV3/d7res.rds'))
d7res
# baseline expression correlation
= readRDS(file = here('mid_res/baseline_response/dataV3/ds.rds'))
ds
# created shorter names
= data.table::fread(
new.names file = here('mid_res/baseline_response/dataV3/baseline.module.name.shortened.txt'),
sep = '\t'
)$cname2 = paste(new.names$celltype, new.names$shortname, sep = ' :: ')
new.namescolnames(ds) = new.names$cname2
# format
= ds %>%
d7form as.data.frame() %>%
rownames_to_column('subject') %>%
mutate(subject = str_sub(subject, 1,3)) %>%
filter(subject %in% colnames(d7res)) %>%
column_to_rownames('subject')
saveRDS(d7form, file = paste0(datapath,'d7form.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/d7form.rds'))
d7form
# pairwise correlation with d7 response
= cbind(t(d7res), as.data.frame(d7form))
dd saveRDS(dd, file = paste0(datapath,'dd.rds'))
= Hmisc::rcorr(as.matrix(dd),type = 'spearman')
d7.cor saveRDS(d7.cor, file = paste0(datapath,'d7.cor.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/d7.cor.rds'))
d7.cor
# aes set
= list(
plotattr theme_bw(),
geom_point(shape = 21, size = 3.5, stroke = 0.8),
#geom_text(nudge_y = 0.05, size = 3),
ylab('Day 7 FC Antibody response signature'),
theme(axis.title.y = element_text(size = 11)),
theme(axis.title.x = element_text(size = 8)),
scale_fill_manual(values = c(col.alpha("red",0.7), col.alpha("dodgerblue", 0.7))),
theme(legend.position = 'none', legend.key.size = unit(0.29, units = 'cm')),
theme(aspect.ratio = 1)
)
# specify response in vis.
= c("205","207","209","212","215","234","237","245","250","256")
high.responders
# calculate and vis correlation between day7 response signature fold change (bulk)
# versus log cpm of the day 7 signatures associated with adjMFC group
for (i in 1:length(colnames(d7form))) {
#i = 1
= c(colnames(d7form)[i], colnames(t(d7res)))
mod.names = cbind(as.data.frame(d7form[, i]), t(d7res))
cplot $subject = rownames(d7form)
cplot$response = ifelse(cplot$subject %in% high.responders, 'high', 'low')
cplotcolnames(cplot)[1:2] = mod.names
# -- for correlation --
= as.data.frame(cbind(v1 = cplot[ ,1], v2 = cplot[ ,2]))
dsub
# calculate and vis. correlation across all subjects; color by response.
= ggpubr::ggscatter(cplot, x = mod.names[1], y = mod.names[2],
p color = col.alpha('white','0.01'),
add.params = list(color = "black", fill = "grey")
+
) +
plotattr aes(fill = response) +
::stat_cor(data = dsub, aes(x = v1, y = v2), method = 'spearman',
ggpubrinherit.aes = FALSE, label.x.npc = "left",label.y.npc = "top", cor.coef.name = "rho",)
# save name by cell type first
= sub("^[^::]*::", "", mod.names[1])
module = gsub("::.*", "", mod.names[1])
celltype ggsave(p, filename = paste0(figpath, celltype, module, 'd7cor.pdf'), width = 3.1, height = 3.1)
}
Create baseline cell phenotype correlation network and ‘calculate
shared latent information’ for intracellular correlations.
mid_res/baseline_response/5_baseline_sli_correlation_network.r
# baseline nettwork correlation
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
source('functions/MattPMutils.r')
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
################################
# Part I correct intracellular correlations for overlapping gene content
################################
# pairwise jaccard index of all leading edge genes from modules
library(scglmmr)
= readRDS(file = here("mid_res/baseline_response/dataV3/g0.sub.rds"))
g0.sub = scglmmr::LeadingEdgeIndexed(gsea.result.list = g0.sub,padj.threshold = 0.05)
li.index
# remove lists with no enrichments indexed by cell type
= base::Filter(g0.sub, f = nrow)
g0.sub = base::Filter(li.index, f = length)
li.index
# reorder enrichment to match leading edge index
= g0.sub[names(li.index)]
g0.sub stopifnot(isTRUE(all.equal(names(g0.sub), names(li.index))))
= li.index[c('CD14_Mono', 'CD16_Mono', 'mDC', 'MAIT_Like')]
ss = lapply(ss, function(x) x %>% unlist() %>% unname() %>% unique())
ss
::venn.diagram(ss,
VennDiagram# color
fill = c("#999999", "#E69F00", "#56B4E9", "#009E73"),
# text
cat.cex = 0.4,cat.default.pos = "inner",cat.fontfamily = "Helvetica",
# file
imagetype="png", height = 1080, width = 1080, resolution = 300,compression = "lzw",
filename = paste0(figpath,"gpvenn.png")
)= VennDiagram::calculate.overlap(ss)
go <- unlist(
go lapply(
1:length(ss), function(j) {
combn(names(ss), j, simplify = FALSE)
}),recursive = FALSE
)#names(combs) <- sapply(combs, function(i) paste0(i, collapse = ""))
#str(combs)
# get jaccard index matrix of intercellular leading edge genes
# from different enrichments
= EnrichmentJaccard(gsealist = g0.sub,
jmat indexedgenes = li.index,
returnJaccardMtx = TRUE)
= jmat$jaccard_matrix_list
jmat
# load of baseline module expression across donors only of
# leading edge genes from baseline enrichments
= readRDS(here('mid_res/baseline_response/dataV3/ds.rds'))
ds ::fwrite(ds,file = paste0(here('git_ignore/ds.csv')),sep = ',')
data.table
# created shorter names
= data.table::fread(
new.names file = here('mid_res/baseline_response/dataV3/baseline.module.name.shortened.txt'),
sep = '\t'
)$cname2 = paste(new.names$celltype, new.names$shortname, sep = ' :: ')
new.namescolnames(ds) = new.names$cname2
# split module expression by cell types to subtract jacard similariry index
# from intracellular spearman correlation coefficient caluclated below
# also add shorter names names
= ds %>%
ds2 t() %>%
as.data.frame() %>%
rownames_to_column('cname2') %>%
full_join(new.names, by = "cname2") %>%
select(-c('module', 'cname', 'shortname')) %>%
select(celltype, everything())
# split
= split(ds2,f = ds2$celltype)
ds2
# calculate Spearman correlation for intracellular correlations
= lapply(ds2, function(x) {
ds.cor = x %>%
mtr select(-c('celltype')) %>%
remove_rownames() %>%
column_to_rownames('cname2') %>%
t()
return(Hmisc::rcorr(mtr, type = 'spearman')$r)
})
# subset to celltypes with multiple enrichments
= ds.cor[names(jmat)]
ds.cor
# append the names of the jaccard matrix with cell type so that format is
# celltype :: module name to match correlation matrix
for (i in 1:length(jmat)) {
= jmat[[i]]
mt colnames(mt) = rownames(mt) = plyr::mapvalues(
rownames(mt),
from = new.names$module,
to = new.names$shortname
)colnames(mt) = rownames(mt) = paste(names(jmat)[i], colnames(mt),sep = ' :: ')
= mt
jmat[[i]]
}
# calculate shared latent informaiton of intracellular correlations
# subtract jaccard similariry from the spearman correlation coefficient
# diagonal corrects to 0
= list()
sli for (i in 1:length(jmat)) {
#i = 1
stopifnot(isTRUE(all.equal(rownames(ds.cor[[i]]), rownames(jmat[[i]]))) )
= ds.cor[[i]] - jmat[[i]]
sli[[i]]
}
# now calculate the full spearman correlation matrix without subtracting jmat
# replace the matrix values from intracellular correlations with the SLI values
# which are stored in sli list by celltype
= Hmisc::rcorr(ds, type = 'spearman')
spearmanmat #saveRDS(spearmanmat, file = paste0(datapath, 'spearmanmat.rds'))
#spearmanmat = readRDS(file = here('mid_res/baseline_response/dataV3/spearmanmat.rds'))
= spearmanmat$r
rhomat = rhomat
mat for (i in 1:length(sli)) {
#i = 1
# get index of cols and rows
= which(rownames(mat) %in% rownames(sli[[i]]))
row.replace = which(colnames(mat) %in% colnames(sli[[i]]))
col.replace
# replace values along the square diagonal of the matrix
# confirm these are the same
stopifnot(isTRUE(all.equal(row.replace, col.replace)))
# check structure
stopifnot(isTRUE(all.equal(
# full spearman matrix subset by rows of celltype i
mat[row.replace, col.replace],# original spearman correlation matrix for celltype i
ds.cor[[i]]
)))
# replace iteratively
= sli[[i]]
mat[row.replace,col.replace]
diag(sli[[i]])
diag(mat[row.replace, col.replace])
# this should be going down with each iteration
print(sum(mat))
}
# there are 2 celltypes with only 1 enrichment.
# These intracellular correlations are just the diagnoal and they were not replaced iteratively
# CD8 mem and CD4 naive
# to adjust these two corerlations = to 1 currently, set the corrected diagonal to 0.
diag(mat) = 0
# save SLI corrected matrix
saveRDS(mat,file = paste0(datapath,'mat.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/mat.rds'))
mat
################################
# Part II visualization
################################
# plot pre and post adjustment correlation map
# spearmancorrelation coefficient rhomat
= colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")))(11)
cu <- max(abs(rhomat))
range # without clustering
dev.off()
pdf(file = paste0(figpath,'preadjusted.cormat.baseline.pdf'),width = 6, height = 5)
::pheatmap(rhomat, color = cu,
pheatmapcluster_rows = FALSE, cluster_cols = FALSE,
border_color = NA,
breaks = seq(-range, range, length.out = 11),
fontsize_row = 5, fontsize_col = 0.01)
dev.off()
# plot sli corrected matrix
pdf(file = paste0(figpath,'post.SLIadjusted.cormat.baseline.pdf'),width = 6, height = 5)
::pheatmap(mat, color = cu,
pheatmapcluster_rows = FALSE, cluster_cols = FALSE,
border_color = NA,
breaks = seq(-range, range, length.out = 11),
fontsize_row = 5, fontsize_col = 0.01)
dev.off()
# figure -- full network visualization as matrix (non pruned)
# cluster the square matrix
#diag(mat)[diag(mat) > 0] = 0
<- max(abs(mat))
range = BuenColors::jdb_palette('solar_flare', type = 'continuous') %>% as.vector
cu3 = cu3[seq(from = 0 , to = 1000,length.out = 20)]
cu3 rownames(mat) = str_replace_all(string = rownames(mat),pattern = '_',replacement = ' ')
pdf(file = paste0(figpath,'post.clustered.SLIadjusted.cormat.baseline.pdf'),width = 8, height = 6.5)
::pheatmap(mat,
pheatmapcolor = cu3,
#cluster_rows = FALSE, cluster_cols = FALSE,
border_color = NA,
treeheight_row = 10,
treeheight_col = 20,
breaks = seq(-range, range, length.out = 20),
fontsize_row = 5.5,
fontsize_col = 0.01)
dev.off()
####
# range <- max(abs(mat))
# cu3 = BuenColors::jdb_palette('solar_flare', type = 'continuous') %>% as.vector
# cu3 = cu3[seq(from = 0 , to = 1000,length.out = 20)]
# rownames(mat) = str_replace_all(string = rownames(mat),pattern = '_',replacement = ' ')
# pdf(file = paste0(figpath,'v2.post.clustered.SLIadjusted.cormat.baseline.pdf'),width = 8.5, height = 8)
# pheatmap::pheatmap(mat,
# color = cu3,
# #cluster_rows = FALSE, cluster_cols = FALSE,
# border_color = NA,
# treeheight_row = 10,
# treeheight_col = 20,
# breaks = seq(-range, range, length.out = 20),
# fontsize_row = 6,
# fontsize_col = 6)
# dev.off()
### New version
pdf(file = paste0(figpath,'v2.post.clustered.SLIadjusted.cormat.baseline.pdf'),width = 15, height = 11)
::corrplot(mat, method="color", col=cu3,
corrplottype="upper", order="hclust",
#addCoef.col = "black", # Add coefficient of correlation
tl.col="black", tl.srt=45,
tl.cex = 0.6, #Text label color and rotation
# Combine with significance
#p.mat = p.mat, sig.level = 0.01, insig = "blank",
# hide correlation coefficient on the principal diagonal
diag=TRUE
)dev.off()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] scglmmr_0.1.0 magrittr_2.0.1 here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [8] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] lme4_1.1-26 tidyselect_1.1.0 RSQLite_2.2.7 AnnotationDbi_1.52.0
# [5] htmlwidgets_1.5.3 grid_4.0.5 BiocParallel_1.24.1 scatterpie_0.1.7
# [9] munsell_0.5.0 codetools_0.2-18 statmod_1.4.35 withr_2.4.3
# [13] colorspace_2.0-0 GOSemSim_2.16.1 Biobase_2.50.0 knitr_1.39
# [17] rstudioapi_0.13 stats4_4.0.5 ggsignif_0.6.0 DOSE_3.16.0
# [21] Rdpack_2.1.1 MatrixGenerics_1.2.1 emmeans_1.5.4 GenomeInfoDbData_1.2.4
# [25] polyclip_1.10-0 bit64_4.0.5 farver_2.0.3 pheatmap_1.0.12
# [29] rprojroot_2.0.2 downloader_0.4 coda_0.19-4 vctrs_0.4.1
# [33] generics_0.1.2 TH.data_1.0-10 xfun_0.30 doParallel_1.0.16
# [37] R6_2.5.0 GenomeInfoDb_1.26.7 graphlayouts_0.7.2 locfit_1.5-9.4
# [41] pals_1.7 bitops_1.0-6 cachem_1.0.4 fgsea_1.16.0
# [45] DelayedArray_0.16.3 assertthat_0.2.1 scales_1.1.1 multcomp_1.4-16
# [49] ggraph_2.0.5 nnet_7.3-15 enrichplot_1.10.2 gtable_0.3.0
# [53] egg_0.4.5 tidygraph_1.2.0 sandwich_3.0-0 rlang_1.0.2
# [57] slanter_0.2-0 splines_4.0.5 rstatix_0.7.0 dichromat_2.0-0
# [61] broom_0.7.5 checkmate_2.0.0 BiocManager_1.30.10 reshape2_1.4.4
# [65] abind_1.4-5 modelr_0.1.8 backports_1.2.1 qvalue_2.22.0
# [69] Hmisc_4.5-0 clusterProfiler_3.18.1 tools_4.0.5 ellipsis_0.3.2
# [73] gplots_3.1.1 RColorBrewer_1.1-2 BiocGenerics_0.36.1 Rcpp_1.0.6
# [77] plyr_1.8.6 progress_1.2.2 base64enc_0.1-3 zlibbioc_1.36.0
# [81] RCurl_1.98-1.3 prettyunits_1.1.1 ggpubr_0.4.0 rpart_4.1-15
# [85] viridis_0.5.1 cowplot_1.1.1 S4Vectors_0.28.1 zoo_1.8-8
# [89] SummarizedExperiment_1.20.0 haven_2.3.1 ggrepel_0.9.1 cluster_2.1.2
# [93] fs_1.5.0 variancePartition_1.25.6 data.table_1.14.0 DO.db_2.9
# [97] openxlsx_4.2.3 reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0
# [101] matrixStats_0.58.0 hms_1.0.0 GSVA_1.38.2 xtable_1.8-4
# [105] pbkrtest_0.5-0.1 RhpcBLASctl_0.21-247.1 XML_3.99-0.6 rio_0.5.16
# [109] jpeg_0.1-8.1 BuenColors_0.5.6 readxl_1.3.1 IRanges_2.24.1
# [113] gridExtra_2.3 compiler_4.0.5 maps_3.4.0 KernSmooth_2.23-18
# [117] crayon_1.4.1 shadowtext_0.0.9 minqa_1.2.4 htmltools_0.5.2
# [121] ggfun_0.0.4 Formula_1.2-4 lubridate_1.7.9.2 DBI_1.1.1
# [125] tweenr_1.0.2 dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27
# [129] Matrix_1.3-2 car_3.0-10 cli_3.3.0 rbibutils_2.0
# [133] parallel_4.0.5 igraph_1.2.6 GenomicRanges_1.42.0 pkgconfig_2.0.3
# [137] rvcheck_0.1.8 foreign_0.8-81 foreach_1.5.1 xml2_1.3.2
# [141] annotate_1.68.0 XVector_0.30.0 GeneOverlap_1.26.0 estimability_1.3
# [145] rvest_0.3.6 digest_0.6.27 graph_1.68.0 cellranger_1.1.0
# [149] fastmatch_1.1-0 htmlTable_2.1.0 edgeR_3.32.1 GSEABase_1.52.1
# [153] curl_4.3 gtools_3.8.2 nloptr_1.2.2.2 nlme_3.1-152
# [157] lifecycle_1.0.0 jsonlite_1.7.2 aod_1.3.1 carData_3.0-4
# [161] mapproj_1.2.8 viridisLite_0.3.0 limma_3.46.0 pillar_1.4.7
# [165] lattice_0.20-41 fastmap_1.1.0 httr_1.4.2 survival_3.2-10
# [169] GO.db_3.12.1 glue_1.6.2 zip_2.1.1 iterators_1.0.13
# [173] png_0.1-7 bit_4.0.4 ggforce_0.3.3 stringi_1.5.3
# [177] blob_1.2.1 org.Hs.eg.db_3.12.0 latticeExtra_0.6-29 caTools_1.18.1
# [181] memoise_2.0.0
Visualize results of baseline high responder networks, part 1.
mid_res/baseline_response/5b_network_construction_and_visualization.r
# baseline nettwork correlation
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(igraph))
suppressMessages(library(ggraph))
source('functions/MattPMutils.r')
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
# load SLI corrected matrix
= readRDS(file = here('mid_res/baseline_response/dataV3/mat.rds'))
mat
# Create mDC and innate sub-network
= data.frame(mods = rownames(mat)) %>%
dn mutate(name = mods) %>%
separate(name, into = c('celltype', 'module'),sep = ' :: ')
= dn %>%
innate.sub filter(celltype %in% c('CD14_Mono', 'CD16_Mono', 'MAIT_Like', 'mDC', 'BC_Naive'))
= innate.sub$mods
ms
# rm QCd modules
#m.rm = readRDS(here('mid_res/baseline_response/dataV3/m.rm.rds'))
= c(
m.rm "CD14_Mono :: M111.1 viral sensing IRF2",
"MAIT_Like :: Kegg Ag Presentation",
"MAIT_Like :: reactome interferon alpha beta"
)= ms[!ms %in% m.rm]
ms
# calculate adjusted p values across the spearman correlation matrix
# load uncorrected matrix Hmisc obejct containing correlation p values
= readRDS(file = here('mid_res/baseline_response/dataV3/spearmanmat.rds'))
spearmanmat = p.adjust.cormat(hmisc.cor = spearmanmat, method = 'fdr')
padj saveRDS(padj, file = paste0(datapath,'padj.rds'))
# filter to the innate subnetwork
# filter edges in SLI adjusted network
# only include correlations with adjusted p < 0.05
= mat
mat2 = mat2[ms, ms]
mat2 = padj[ms, ms]
padj stopifnot(isTRUE(all.equal(colnames(padj), colnames(mat2))))
# filter based on adjusted p
> 0.05] <- 0
mat2[padj
# save the pruned mat2
saveRDS(mat2,file = paste0(datapath,'mat2.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/mat2.rds'))
mat2
# make graph of the strongly linked edges pruned above
<- graph_from_adjacency_matrix(
net weighted = TRUE,
mat2, mode = 'undirected',
diag = FALSE
)
# prune the graph further to retain links above the median weight
<- median(E(net)$weight)
med.weight = mat2
mat3 < med.weight] <- 0
mat3[mat3 saveRDS(mat3,file = paste0(datapath,'mat3.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/mat3.rds'))
mat3
# make a subhraph with stonger connections above prev median weight.
<- graph_from_adjacency_matrix(
net
mat3,weighted = TRUE,
mode = 'undirected',
diag = FALSE
)
# create network annotations frame for vertices
= data.frame(signal = colnames(mat3)) %>%
d mutate(s = signal) %>%
separate(s,into = c('celltype', 'module'),sep = ' :: ')
# specify vertex attributes
V(net)$celltype = d$celltype
V(net)$module = d$module
# calculate network degree and hubs / authority scores (same for undirected)
V(net)$degree <- degree(net)
V(net)$hubs <- hub.score(net)$vector
V(net)$authorities <- authority.score(net)$vector
# add vertex property information to `d``
$degree = V(net)$degree
d$hubscore = V(net)$hubs
d
# add d7 correlation to d
= readRDS(file = here('mid_res/baseline_response/dataV3/d7.cor.rds'))
d7.cor = d7.cor$P[1, -1][ms]
d7.cor.p = d7.cor$r[1, -1][ms]
d7.cor.rho # check orders correct
stopifnot(isTRUE(all.equal(names(d7.cor.p), d$signal)))
$d7cor.p = d7.cor.p
d$d7cor.rho = d7.cor.rho
d
# add this information to the network
V(net)$d7cor.p = d7.cor.p
V(net)$d7cor.rho = d7.cor.rho
# specify edge width as the weight
E(net)$width = E(net)$weight
# save network
saveRDS(net,file = paste0(datapath,'net.rds'))
= readRDS(file = here('mid_res/baseline_response/dataV3/net.rds'))
net
############################
# plot hubs
= d %>% filter(celltype == 'CD14_Mono') %$% signal
signal.highlight = d %>% filter(celltype == 'CD16_Mono') %$% signal
signal.highlight2
= c('#FFD38F', '#F4A69B', '#A7DDEA', '#8ACFC3', '#9FABC4')
cu3
=
p ggplot(d, aes(y = reorder(module, hubscore), x = hubscore , fill = celltype, label = signal )) +
theme_bw() +
geom_point(shape =21, size = 3.5) +
#ggsci::scale_fill_npg(alpha = 0.8) +
scale_fill_manual(values = cu3) +
ylab('') +
theme(legend.position = c(0.8,0.15), legend.key.size = unit(0.2,units = 'cm')) +
theme(axis.text = element_text(color = 'black')) +
xlab('Hub Score') +
theme(aspect.ratio = 1.1) +
::geom_text_repel(data = d %>% filter(signal %in% signal.highlight & hubscore > 0.75),
ggrepelsize = 2.5, nudge_y = 0, nudge_x = -0.3, seed = 2, segment.size = 0.1,
force = 40,
max.overlaps = 10) +
::geom_text_repel(data = d %>% filter(signal %in% signal.highlight2 & hubscore > 0.85),
ggrepelsize = 2.5, nudge_y = 0, nudge_x = -0.3, box.padding = 0.4, seed = 1,
max.overlaps =10,force = 40,
segment.size = 0.1)
pggsave(p, filename = paste0(figpath, 'innate.subnetwork.hub.pdf'), width = 6.5, height = 6.5)
# specify colors for nodes
= c( col.alpha('orange', alpha = 0.5), ggsci::pal_npg(alpha = 0.5)(4))
cu = cu[factor(V(net)$celltype)]
c.celltype # layout network.
<- layout_in_circle(net)
lay
# specify celltypes for leend
= str_replace_all(string = levels(factor(V(net)$celltype)),pattern = '_',replacement = ' ')
cts
# version with vertiices highlighted
# specify sve path for subgraph plots
= here('mid_res/baseline_response/figuresV3/subgraphsLABELED/'); dir.create(figpath3, recursive = TRUE)
figpath3 # plot the subgraphs
for (i in 1:length(unique(V(net)))) {
# highlight edges
# specify subset highlighted
= incident(net, v = V(net)[i], mode="all")
edge.highlight.t # for savig
= str_replace_all( names(V(net)[i]), pattern = ' :: ', replacement = ' ')
signal
# make a new network
= net
net.sp # set size for highlighted edge
E(net.sp)$width = 1.1
# remove the other edges for visualization
= E(net.sp)[!E(net.sp) %in% edge.highlight.t]
ot <- delete_edges(net.sp, edges = ot)
net.sp E(net.sp)$color = col.alpha('black',0.7)
# plot network
pdf(file = paste0(figpath3,signal,'.subnetwork.pdf'),width = 10, height = 10)
plot(net.sp,
layout = lay,
vertex.label = names(V(net.sp)),
vertex.label.cex=0.3,
edge.size = E(net.sp)$weight,
vertex.size = log(V(net.sp)$degree+ 1)*4,
edge.curved = 0.1,
vertex.color = c.celltype,
vertex.size=degree(net.sp))
legend(x=0.75, y=1.2,
cts,pch=21,
pt.bg=cu,
pt.cex=1, cex=.5, bty="n",ncol=1)
dev.off()
}
#### This not run in published workflow
### Commented out for published workflow bc redundant w code below -- this used to make figs.
# specify sve path for subgraph plots
# figpath2 = here('mid_res/baseline_response/figuresV3/subgraphs/'); dir.create(figpath2, recursive = TRUE)
# # plot the subgraphs
# for (i in 1:length(unique(V(net)))) {
#
# # highlight edges
# # specify subset highlighted
# edge.highlight.t = incident(net, v = V(net)[i], mode="all")
# # for savig
# signal = str_replace_all( names(V(net)[i]), pattern = ' :: ', replacement = ' ')
#
# # make a new network
# net.sp = net
# # set size for highlighted edge
# E(net.sp)$width = 1.1
# # remove the other edges for visualization
# ot = E(net.sp)[!E(net.sp) %in% edge.highlight.t]
# net.sp <- delete_edges(net.sp, edges = ot)
# E(net.sp)$color = col.alpha('black',0.7)
# # plot network
# pdf(file = paste0(figpath2,signal,'.subnetwork.pdf'),width = 4.5, height = 4.5)
# plot(net.sp,
# layout = lay,
# vertex.label = NA,
# vertex.label.cex=0.3,
# edge.size = E(net.sp)$weight,
# vertex.size = log(V(net.sp)$degree+ 1)*4,
# edge.curved = 0.1,
# vertex.color = c.celltype,
# vertex.size=degree(net.sp))
# legend(x=0.75, y=1.2,
# cts,
# pch=21,
# pt.bg=cu,
# pt.cex=1, cex=.5, bty="n",ncol=1)
# dev.off()
# }
#
Visualize results of baseline high responder networks, part 2.
mid_res/baseline_response/5c_network_correlations.r
# baseline nettwork correlation
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(igraph))
source('functions/MattPMutils.r')
# set paths
= here("mid_res/baseline_response/dataV3")
datapath = here("mid_res/baseline_response/figuresV3/network_correlations/");
figpath dir.create(figpath,recursive = TRUE)
# load of baseline module expression across donors only of
# leading edge genes from baseline enrichments
= readRDS(here('mid_res/baseline_response/dataV3/ds.rds'))
ds #data.table::fwrite(ds,file = paste0(here('git_ignore/ds.csv')),sep = ',')
# created shorter names -- read these in
= data.table::fread(
new.names file = here('mid_res/baseline_response/dataV3/baseline.module.name.shortened.txt'),
sep = '\t'
)$cname2 = paste(new.names$celltype, new.names$shortname, sep = ' :: ')
new.namescolnames(ds) = new.names$cname2
# fix subject names
= ds %>%
dp as.data.frame() %>%
rownames_to_column('subject') %>%
mutate(subject = str_sub(subject, 1,3)) %>%
column_to_rownames('subject')
# read matrix and network
= readRDS(file = here('mid_res/baseline_response/dataV3/net.rds'))
net = as_long_data_frame(net)
edf
# readadj p vas for comparison
= readRDS(file = here('mid_res/baseline_response/dataV3/padj.rds'))
padj
# aes set
= list(
plotattr theme_bw(),
geom_point(shape = 21, size = 3.5, stroke = 0.8),
theme(axis.title.y = element_text(size = 8)),
theme(axis.title.x = element_text(size = 8)),
scale_fill_manual(values = c(col.alpha("red",0.7), col.alpha("dodgerblue", 0.7))),
theme(legend.position = 'none', legend.key.size = unit(0.29, units = 'cm')),
theme(aspect.ratio = 1)
)
# specify response in vis.
= c("205","207","209","212", "215",
high.responders "234","237","245","250","256")
for (i in 1:nrow(edf)) {
# get edge to plot from the data framed network
= c(edf[i, ]$from_name, edf[i, ]$to_name)
mod.names
= dp %>%
cplot select(all_of(mod.names)) %>%
rownames_to_column('subject') %>%
mutate(response = ifelse(subject %in% high.responders, 'high', 'low'))
= cor.test(cplot[ , 2], cplot[ ,3], method = 'spearman', exact = FALSE)$p.value
ctp = padj[mod.names[1], mod.names[2]]
adjusted.p print(ctp < adjusted.p)
# calculate and vis. correlation across all subjects; color by response.
= ggpubr::ggscatter(cplot,
p x = mod.names[1],
y = mod.names[2],
conf.int = FALSE,
color = col.alpha('white','0.01'),
add.params = list(color = "black", fill = "grey")) +
+
plotattr aes(fill = response)
# save name by cell type first
= str_replace_all(mod.names,pattern = ' :: ', replacement = '..')
modsave ggsave(p, filename = paste0(figpath, modsave[1], '___', modsave[2], 'cor.pdf'), width = 3.1, height = 3.1)
}
# draw a legend
= here('mid_res/baseline_response/figuresV3/')
fp2 = p + theme(legend.position = 'top')
p2 <- cowplot::get_legend(p2)
legend pdf(file = paste0(fp2, 'LEGEND.pdf'),width = 2, height = 1)
::grid.draw(legend)
griddev.off()
Fit single cell mixed model to test day 1 post vaccination kinetics
of baseline cell phenotypes defined above.
mid_res/baseline_response/6_sc.kinetic.singlecellmodel.r
# Early kinetics of baseline states
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
suppressMessages(library(scglmmr))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath
# load single cell data
= readRDS(file = here('data/h1h5_annotated_with_meta.rds'))
h1 = h1@meta.data %>%
md filter(cohort == 'H1N1') %>%
filter(time_cohort == 'd1') %>%
mutate(group_id = factor(adjmfc.time, levels = c("d0 low", "d1 low", "d0 high", "d1 high"))) %>%
mutate(subjectid = factor(sampleid)) %>%
mutate(sex = factor(gender)) %>%
mutate(scaled.age = as.numeric(scale(age))) %>%
mutate(celltype = celltype_joint)
# add a covariate for number of cells per sample
= md %>% group_by(sample) %>% summarize(ncells = n())
ncell_met $ncell = plyr::mapvalues(x = md$sample, from = ncell_met$sample, to = ncell_met$ncells)
md$ncell = as.numeric(md$ncell)
md$log10ncell = log10(md$ncell)
md
# subset normalized RNA data
= h1@data[ ,rownames(md)]
norm.rna
# get leading edge genes from cur. baseline mods
= readRDS(file = here('mid_res/baseline_response/dataV3/g0.sub.rds'))
g0.sub = LeadingEdgeIndexed(gsea.result.list = g0.sub, padj.threshold = 0.05)
li.g0 = base::Filter(length, li.g0)
li.g0
# metadata by cell type
= names(li.g0)
cts = md %>% filter( celltype %in% cts )
md = split( md, f = md$celltype )
ct.md
# module score for each cell type of specific baseline enriched leading edge genes.
= list()
mod_scores for (i in 1:length(ct.md)) {
# init data for subset i
= norm.rna[ ,rownames(ct.md[[i]])]
rna = li.g0[[i]]
mod.list
# calculate single cell score for baseline-enriched module
= WeightedCellModuleScore(
mod_scores[[i]] gene_matrix = rna,
module_list = mod.list,
threshold = 0,
cellwise_scaling = TRUE,
return_weighted = FALSE
)
# add a "null" score of Gaussian noise as a reference
$null = rnorm(n = nrow(mod_scores[[i]]), mean = 0, sd = 1)
mod_scores[[i]]
}
# specify save paths for marginal means plots.
= paste0(figpath, "/marginalmeans.m1/"); dir.create(plot_savepath1)
plot_savepath1 = paste0(figpath, "/marginalmeans.m2/"); dir.create(plot_savepath2)
plot_savepath2
# specify the 2 models
= 'modulescore ~ 0 + group_id + (1|subjectid)'
f1 = 'modulescore ~ 0 + group_id + log10ncell + scaled.age + sex + (1|subjectid)'
f2 # reviewer question
= 'modulescore ~ 0 + group_id + log10ncell + (1|subjectid)'
f3
# fit sc mod mixed model on module scores.
= mm2 = list()
mm1 for (i in 1:length(ct.md)) {
stopifnot( nrow(ct.md[[i]]) == nrow(mod_scores[[i]]) )
# formula 1
= FitLmerContrast(module_data_frame = mod_scores[[i]],
mm1[[i]] celltype_column = 'celltype',
metadata = ct.md[[i]],
lmer_formula = f1,
plotdatqc = FALSE,
fixed_effects = NULL,
figpath = plot_savepath1)
# formula 2
= FitLmerContrast(module_data_frame = mod_scores[[i]],
mm2[[i]] celltype_column = 'celltype',
metadata = ct.md[[i]],
lmer_formula = f2,
plotdatqc = FALSE,
fixed_effects = NULL,
figpath = plot_savepath2)
}
= do.call(rbind, mm1)
mm1 = do.call(rbind, mm2)
mm2
saveRDS(mm1,file = paste0(datapath, 'mm1.rds'))
saveRDS(mm2,file = paste0(datapath, 'mm2.rds'))
= list()
mm3 for (i in 1:length(ct.md)) {
stopifnot( nrow(ct.md[[i]]) == nrow(mod_scores[[i]]) )
# formula 1
= FitLmerContrast(module_data_frame = mod_scores[[i]],
mm3[[i]] celltype_column = 'celltype',
metadata = ct.md[[i]],
lmer_formula = f3,
plotdatqc = FALSE,
fixed_effects = NULL,
figpath = NULL)
}
= do.call(rbind, mm3)
mm3
saveRDS(mm3,file = paste0(datapath, 'mm3.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] SeuratObject_4.0.0 Seurat_4.0.1 scglmmr_0.1.0 magrittr_2.0.1 here_1.0.1
# [6] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0
# [11] tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] estimability_1.3 scattermore_0.7 coda_0.19-4
# [4] bit64_4.0.5 irlba_2.3.3 multcomp_1.4-16
# [7] DelayedArray_0.16.3 rpart_4.1-15 data.table_1.14.0
# [10] RCurl_1.98-1.3 generics_0.1.0 BiocGenerics_0.36.1
# [13] cowplot_1.1.1 TH.data_1.0-10 RSQLite_2.2.7
# [16] shadowtext_0.0.9 RANN_2.6.1 future_1.21.0
# [19] bit_4.0.4 enrichplot_1.10.2 spatstat.data_2.1-0
# [22] xml2_1.3.2 lubridate_1.7.9.2 httpuv_1.5.5
# [25] SummarizedExperiment_1.20.0 assertthat_0.2.1 viridis_0.5.1
# [28] hms_1.0.0 promises_1.2.0.1 caTools_1.18.1
# [31] dbplyr_2.1.0 readxl_1.3.1 igraph_1.2.6
# [34] DBI_1.1.1 htmlwidgets_1.5.3 spatstat.geom_2.0-1
# [37] stats4_4.0.5 ellipsis_0.3.1 ggpubr_0.4.0
# [40] backports_1.2.1 annotate_1.68.0 deldir_0.2-10
# [43] MatrixGenerics_1.2.1 vctrs_0.3.6 Biobase_2.50.0
# [46] ROCR_1.0-11 abind_1.4-5 cachem_1.0.4
# [49] withr_2.4.3 ggforce_0.3.3 packrat_0.7.0
# [52] emmeans_1.5.4 sctransform_0.3.2 goftest_1.2-2
# [55] cluster_2.1.2 DOSE_3.16.0 lazyeval_0.2.2
# [58] crayon_1.4.1 edgeR_3.32.1 pkgconfig_2.0.3
# [61] labeling_0.4.2 tweenr_1.0.2 GenomeInfoDb_1.26.7
# [64] nlme_3.1-152 rlang_0.4.10 globals_0.14.0
# [67] lifecycle_1.0.0 miniUI_0.1.1.1 sandwich_3.0-0
# [70] downloader_0.4 modelr_0.1.8 cellranger_1.1.0
# [73] rprojroot_2.0.2 polyclip_1.10-0 GSVA_1.38.2
# [76] matrixStats_0.58.0 lmtest_0.9-38 graph_1.68.0
# [79] Matrix_1.3-2 carData_3.0-4 boot_1.3-27
# [82] zoo_1.8-8 reprex_1.0.0 ggridges_0.5.3
# [85] pheatmap_1.0.12 png_0.1-7 viridisLite_0.3.0
# [88] bitops_1.0-6 KernSmooth_2.23-18 blob_1.2.1
# [91] qvalue_2.22.0 parallelly_1.23.0 rstatix_0.7.0
# [94] S4Vectors_0.28.1 ggsignif_0.6.0 scales_1.1.1
# [97] memoise_2.0.0 GSEABase_1.52.1 plyr_1.8.6
# [100] ica_1.0-2 gplots_3.1.1 zlibbioc_1.36.0
# [103] compiler_4.0.5 scatterpie_0.1.7 RColorBrewer_1.1-2
# [106] lme4_1.1-26 fitdistrplus_1.1-3 cli_2.5.0
# [109] XVector_0.30.0 listenv_0.8.0 patchwork_1.1.1
# [112] pbapply_1.4-3 mgcv_1.8-34 MASS_7.3-53.1
# [115] tidyselect_1.1.0 stringi_1.5.3 GOSemSim_2.16.1
# [118] locfit_1.5-9.4 ggrepel_0.9.1 GeneOverlap_1.26.0
# [121] grid_4.0.5 fastmatch_1.1-0 tools_4.0.5
# [124] future.apply_1.7.0 parallel_4.0.5 rio_0.5.16
# [127] rstudioapi_0.13 foreign_0.8-81 gridExtra_2.3
# [130] farver_2.0.3 Rtsne_0.15 ggraph_2.0.5
# [133] digest_0.6.27 rvcheck_0.1.8 BiocManager_1.30.10
# [136] shiny_1.6.0 Rcpp_1.0.6 GenomicRanges_1.42.0
# [139] car_3.0-10 broom_0.7.5 egg_0.4.5
# [142] later_1.1.0.1 RcppAnnoy_0.0.18 org.Hs.eg.db_3.12.0
# [145] httr_1.4.2 AnnotationDbi_1.52.0 colorspace_2.0-0
# [148] tensor_1.5 rvest_0.3.6 XML_3.99-0.6
# [151] fs_1.5.0 reticulate_1.18 IRanges_2.24.1
# [154] splines_4.0.5 uwot_0.1.10 statmod_1.4.35
# [157] spatstat.utils_2.1-0 graphlayouts_0.7.2 plotly_4.9.3
# [160] xtable_1.8-4 jsonlite_1.7.2 nloptr_1.2.2.2
# [163] tidygraph_1.2.0 ggfun_0.0.4 R6_2.5.0
# [166] pillar_1.4.7 htmltools_0.5.2 mime_0.10
# [169] glue_1.4.2 fastmap_1.1.0 minqa_1.2.4
# [172] clusterProfiler_3.18.1 BiocParallel_1.24.1 codetools_0.2-18
# [175] fgsea_1.16.0 mvtnorm_1.1-1 spatstat.sparse_2.0-0
# [178] lattice_0.20-41 slanter_0.2-0 curl_4.3
# [181] leiden_0.3.7 gtools_3.8.2 zip_2.1.1
# [184] GO.db_3.12.1 openxlsx_4.2.3 survival_3.2-10
# [187] limma_3.46.0 munsell_0.5.0 DO.db_2.9
# [190] GenomeInfoDbData_1.2.4 haven_2.3.1 reshape2_1.4.4
# [193] gtable_0.3.0 spatstat.core_2.0-0
Visualize results of kinetic analysis above.
mid_res/baseline_response/7_sc.kinetic.singlecellmodel.figures.r
# Early kinetics of baseline states
# visualize model results
set.seed(1990)
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
# set paths
= here("mid_res/baseline_response/dataV3/")
datapath = here("mid_res/baseline_response/figuresV3/")
figpath source('functions/MattPMutils.r')
= readRDS(file = here('mid_res/baseline_response/dataV3/mm2.rds')) %>%
mm2 filter(!module == 'null') %>%
filter(! singular_fit == 1)
$cm = paste(mm2$celltype, mm2$module,sep = ' :: ')
mm2
# change to shorter names
= data.table::fread(file = here('mid_res/baseline_response/dataV3/baseline.module.name.shortened.txt'), sep = '\t')
new.names $cname2 = paste(new.names$celltype, new.names$shortname, sep = ' :: ')
new.names$cname1 = paste(new.names$celltype, new.names$module, sep = ' :: ')
new.names$cm = plyr::mapvalues(x = mm2$cm, from = new.names$cname1, to = new.names$cname2)
mm2
# assign to 'd'
= mm2
d
# plot innate subset
= d %>% filter(celltype %in% c( 'CD14_Mono', 'CD16_Mono', 'mDC', "MAIT_Like" ))
ds
# filter the modules that did not have a effect in the single cell model
= ds %>%
m.rm filter(estimatetime0_group2vs1 > 0.1) %>%
mutate(m1s = estimatetime0_group2vs1 - std.errortime0_group2vs1) %>%
filter(m1s > -0.01) %$% cm
saveRDS(m.rm, file = paste0(datapath, 'm.rm.rds'))
= ds %>% filter(cm %in% m.rm)
ds
# remove cell label from module name to reduce clutter
$cm = gsub(".*:","",ds$cm)
ds$celltype = str_replace_all(ds$celltype, pattern = '_',replacement =' ')
ds
= list(
pl # baseline
geom_point(shape = 23, size = 3, color = 'black', fill = col.alpha('red', 0.5)),
geom_segment(aes(x = (estimatetime0_group2vs1 + -1*std.errortime0_group2vs1),
xend = estimatetime0_group2vs1 + 1*std.errortime0_group2vs1,
yend = cm),
color = col.alpha('red', 0.5),
size = 2),
# day 1
geom_point(data = ds, aes(x = estimatetime1vs0, y = cm), size = 3, shape = 23,
color = 'black', fill = col.alpha('#e2a359', 0.5)),
geom_segment(aes(x = (estimatetime1vs0 + -1*std.errortime1vs0),
xend = estimatetime1vs0 + 1*std.errortime1vs0,
yend = cm),
color = col.alpha('#e2a359', 0.5),
size = 2),
theme(
axis.text.y = element_text(color = 'black', size = 10),
strip.background = element_blank(),
axis.title.x = element_text(size = 14),
axis.text.x = element_text(size = 14),
strip.text = element_text(size = 12),
panel.spacing.x=unit(2, "lines")
)
)
# mdc, CD14 mono, CD16 mono, mait-like
=
p ggplot(ds, aes(x = estimatetime0_group2vs1, y = reorder(cm, estimatetime1vs0 ))) +
facet_grid(vars(celltype), scales = 'free', space = 'free') +
theme_bw() +
geom_vline(xintercept = 0, linetype = 'dashed') +
+
pl scale_x_continuous( breaks= scales::pretty_breaks(), expand = c(0.15,0)) +
ylab('') +
xlab('contrast effect size ')
pggsave(p, filename = paste0(figpath, 'mm2.innate.mait.pdf'), width = 5, height = 6.5)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] magrittr_2.0.1 here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0
# [8] tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] Rcpp_1.0.6 plyr_1.8.6 pillar_1.4.7 compiler_4.0.5 cellranger_1.1.0 dbplyr_2.1.0
# [7] tools_4.0.5 digest_0.6.27 packrat_0.7.0 jsonlite_1.7.2 lubridate_1.7.9.2 lifecycle_1.0.0
# [13] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.10 reprex_1.0.0 cli_2.5.0 rstudioapi_0.13
# [19] DBI_1.1.1 haven_2.3.1 withr_2.4.3 xml2_1.3.2 httr_1.4.2 fs_1.5.0
# [25] generics_0.1.0 vctrs_0.3.6 hms_1.0.0 rprojroot_2.0.2 grid_4.0.5 tidyselect_1.1.0
# [31] data.table_1.14.0 glue_1.4.2 R6_2.5.0 readxl_1.3.1 farver_2.0.3 modelr_0.1.8
# [37] backports_1.2.1 scales_1.1.1 ellipsis_0.3.1 rvest_0.3.6 assertthat_0.2.1 colorspace_2.0-0
# [43] labeling_0.4.2 stringi_1.5.3 munsell_0.5.0 broom_0.7.5 crayon_1.4.1
Process data from GSE171964.
mid_res/mrna/mrna_1_setup.r
# R 4.0.5
# testing out mrna data
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(Matrix))
# save paths
= file.path(here('mid_res/mrna/generated_data/'))
datapath dir.create(datapath)
# read sparse matrix
= Matrix::readMM(file = here('data/GSE171964/matrix.mtx'))
mtx
# reformat because not formatted in geo for Seurat::Read10X()
= read.delim(file = here('data/GSE171964/barcodes.tsv'))
cells = cells %>% separate(x, into = c('space', 'barcode'),sep =' ')
cells = cells$barcode
cells saveRDS(cells, file = paste0(datapath,'cells.rds'))
= readRDS(file = here('mid_res/mrna/generated_data/cells.rds'))
cells
# reformat features
= read.delim(file = here('data/GSE171964/features.tsv'))
features = features %>% separate(x, into = c('space', 'feature'),sep =' ')
features = features$feature
features saveRDS(features, file = paste0(datapath,'features.rds'))
= readRDS(file = here('mid_res/mrna/generated_data/features.rds'))
features
# set names of matrix
colnames(mtx) = cells
rownames(mtx) = features
# load metadata - updated author correction data
= read.delim(file = here('data/GSE171964/GSE171964_geo_pheno_v2.csv'), sep = ',')
p
# define the day 0 day 1 cells and format for sc meta
= p %>% filter(day %in% c('0', '1', '21','22'))
psub = psub$barcode
cell_sub = psub %>% column_to_rownames('barcode')
md
# subset matrix
= mtx[ ,cell_sub]
mtx
# get ADT data
= features[grep(pattern = '_ADT',x = features)]
proteins = mtx[proteins, ]
adt = mtx[rownames(mtx)[!rownames(mtx) %in% proteins], ]
rna
# save
saveRDS(rna, file = paste0(datapath,'rna.rds'))
saveRDS(adt, file = paste0(datapath,'adt.rds'))
saveRDS(md, file = paste0(datapath,'md.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] Matrix_1.3-2 here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4
# [6] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3
# [11] tidyverse_1.3.0 SeuratObject_4.0.0 Seurat_4.0.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_2.0-0 deldir_0.2-10 ellipsis_0.3.1
# [5] ggridges_0.5.3 rprojroot_2.0.2 fs_1.5.0 rstudioapi_0.13
# [9] spatstat.data_2.1-0 leiden_0.3.7 listenv_0.8.0 ggrepel_0.9.1
# [13] lubridate_1.7.9.2 xml2_1.3.2 codetools_0.2-18 splines_4.0.5
# [17] polyclip_1.10-0 jsonlite_1.7.2 broom_0.7.5 ica_1.0-2
# [21] cluster_2.1.2 dbplyr_2.1.0 png_0.1-7 uwot_0.1.10
# [25] shiny_1.6.0 sctransform_0.3.2 spatstat.sparse_2.0-0 compiler_4.0.5
# [29] httr_1.4.2 backports_1.2.1 assertthat_0.2.1 fastmap_1.1.0
# [33] lazyeval_0.2.2 cli_2.5.0 later_1.1.0.1 htmltools_0.5.1.1
# [37] tools_4.0.5 igraph_1.2.6 gtable_0.3.0 glue_1.4.2
# [41] RANN_2.6.1 reshape2_1.4.4 Rcpp_1.0.6 scattermore_0.7
# [45] cellranger_1.1.0 vctrs_0.3.6 nlme_3.1-152 lmtest_0.9-38
# [49] globals_0.14.0 rvest_0.3.6 mime_0.10 miniUI_0.1.1.1
# [53] lifecycle_1.0.0 irlba_2.3.3 goftest_1.2-2 future_1.21.0
# [57] MASS_7.3-53.1 zoo_1.8-8 scales_1.1.1 spatstat.core_2.0-0
# [61] hms_1.0.0 promises_1.2.0.1 spatstat.utils_2.1-0 parallel_4.0.5
# [65] RColorBrewer_1.1-2 reticulate_1.18 pbapply_1.4-3 gridExtra_2.3
# [69] rpart_4.1-15 stringi_1.5.3 rlang_0.4.10 pkgconfig_2.0.3
# [73] matrixStats_0.58.0 lattice_0.20-41 ROCR_1.0-11 tensor_1.5
# [77] patchwork_1.1.1 htmlwidgets_1.5.3 cowplot_1.1.1 tidyselect_1.1.0
# [81] parallelly_1.23.0 RcppAnnoy_0.0.18 plyr_1.8.6 magrittr_2.0.1
# [85] R6_2.5.0 generics_0.1.0 DBI_1.1.1 withr_2.4.1
# [89] pillar_1.4.7 haven_2.3.1 mgcv_1.8-34 fitdistrplus_1.1-3
# [93] survival_3.2-10 abind_1.4-5 future.apply_1.7.0 modelr_0.1.8
# [97] crayon_1.4.1 KernSmooth_2.23-18 spatstat.geom_2.0-1 plotly_4.9.3
# [101] grid_4.0.5 readxl_1.3.1 data.table_1.14.0 reprex_1.0.0
# [105] digest_0.6.27 xtable_1.8-4 httpuv_1.5.5 munsell_0.5.0
# [109] viridisLite_0.3.0
Renormalize raw ADT data with
dsb::ModelNegativeADTnorm()
and manually gate cells.
mid_res/mrna/mrna_2_gatemono.r
# R 4.0.5
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(Matrix))
suppressMessages(library(ggsci))
library(magrittr)
library(dsb)
# set paths
= file.path(here('mid_res/mrna/generated_data/'))
datapath = file.path(here('mid_res/mrna/figures/'))
figpath
# load data
= readRDS(file = paste0(datapath,'rna.rds'))
rna = readRDS(file = paste0(datapath,'md.rds'))
md = readRDS(file = paste0(datapath,'adt.rds'))
adt
#slim
<- as(object = rna, Class = "dgCMatrix")
rna <- as(object = adt, Class = "dgCMatrix")
adt
# seurat workflow
= CreateSeuratObject(counts = rna, min.cells = 20, meta.data = md)
s
# normalize ADT with dsb function ModelNegativeADTnorm
= c("Isotype1_ADT", "Isotype2_ADT", "Isotype3_ADT", "Isotype4_ADT")
iso = ModelNegativeADTnorm(cell_protein_matrix = adt,
adt_norm denoise.counts = TRUE,
use.isotype.control = TRUE,
isotype.control.name.vec = iso,
quantile.clipping = TRUE,
return.stats = TRUE)
# define phenotyping antibodies
rownames(adt_norm$dsb_normalized_matrix) =
str_replace_all(
string = rownames(adt_norm$dsb_normalized_matrix),
pattern = '_', replacement = '-'
)
'CITE']] = CreateAssayObject(counts = adt)
s[[= SetAssayData(object = s,
s slot = 'data',
new.data = adt_norm$dsb_normalized_matrix,
assay = 'CITE')
# save processed object
saveRDS(s,file = paste0(datapath, 's.rds'))
# gate out monocytes
library(scales)
= cbind(s@meta.data, data.frame(t(as.matrix(s@assays$CITE@data))))
d = ggplot(d, aes(x = CD14.ADT, y = CD3.ADT)) + geom_point(size =0.1, alpha = 0.4) +
p scale_y_continuous( breaks=pretty_breaks() ) +
scale_x_continuous( breaks=pretty_breaks() ) +
geom_abline(slope = 1,intercept = -0.9, color = 'red') +
geom_vline(xintercept = 1.5, color = 'red') +
geom_hline(yintercept = 1.5, color = 'red')
# manually create triangular gate
$tx = d$CD14.ADT > 1.5
d$ty = d$CD3.ADT < 1.5
d$tlm = d$CD14*1 + -0.9
d# add gate info
$pp3 = ifelse(d$tx==TRUE & d$ty==TRUE & d$CD3.ADT < d$tlm, yes = '1',no = '0')
d
# plot with gated cells highlighted
= ggplot(d, aes(x = CD14.ADT, y = CD3.ADT, color = pp3)) +
p geom_point(size =0.1, alpha = 0.4) +
theme_bw() +
scale_y_continuous( breaks=pretty_breaks()) +
scale_x_continuous( breaks=pretty_breaks()) +
geom_abline(slope = 1,intercept = -0.9, color = 'red') +
geom_vline(xintercept = 1.5, color = 'red') +
geom_hline(yintercept = 1.5, color = 'red') +
ylab('CD3 ADT dsb:ModelNegative') + xlab('CD14 ADT dsb::ModelNegative')
pggsave(p,filename = paste0(figpath,'monogate.png'), width = 9, height = 8)
# define monocytes
= d[d$pp3==1, ] %>% rownames()
dmono saveRDS(dmono,file = paste0(datapath, 'dmono.rds'))
# subset and save monocyte Seurat object.
= subset(s,cells = dmono)
s.mono saveRDS(s.mono,file = paste0(datapath, 's.mono.rds'))
# gate out mdc
= ggplot(d, aes(x = CD11c.ADT, y = CD1c.BDCA1.ADT)) +
p geom_point(size =0.1, alpha = 0.4) +
geom_vline(xintercept = 2.2, color = 'red') +
geom_hline(yintercept = 1.5, color = 'red')
# define mDC
$mdc = ifelse(d$CD11c.ADT>2.2 & d$CD1c.BDCA1.ADT>1.5, yes = '1',no = '0')
d
#plot gated cells
= ggplot(d, aes(x = CD11c.ADT, y = CD1c.BDCA1.ADT, color = mdc)) +
p theme_bw() +
geom_point(size =0.1, alpha = 0.4) +
geom_vline(xintercept = 2.2, color = 'red') +
geom_hline(yintercept = 1.5, color = 'red')
ggsave(p,filename = paste0(figpath,'mdc_gate.png'), width = 9, height = 8)
# subst mDC
= d[d$mdc=="1", ] %>% rownames()
mdc.cells
# subset mDC
= subset(s,cells = mdc.cells)
s.mdc saveRDS(s.mdc,file = paste0(datapath, 's.mdc.rds'))
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] magrittr_2.0.1 scales_1.1.1 plotly_4.9.3 shiny_1.6.0
# [5] dsb_1.0.2 ggsci_2.9 Matrix_1.3-2 here_1.0.1
# [9] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [13] readr_1.4.0 tidyr_1.1.2 tibble_3.0.6 ggplot2_3.3.3
# [17] tidyverse_1.3.0 SeuratObject_4.0.0 Seurat_4.0.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_2.0-0 deldir_1.0-6
# [4] ellipsis_0.3.2 ggridges_0.5.3 rsconnect_0.8.25
# [7] mclust_5.4.7 rprojroot_2.0.2 fs_1.5.0
# [10] rstudioapi_0.13 spatstat.data_2.1-0 farver_2.0.3
# [13] leiden_0.3.7 listenv_0.8.0 ggrepel_0.9.1
# [16] lubridate_1.7.9.2 xml2_1.3.2 codetools_0.2-18
# [19] splines_4.0.5 cachem_1.0.4 polyclip_1.10-0
# [22] jsonlite_1.7.2 packrat_0.7.0 broom_0.7.5
# [25] ica_1.0-2 cluster_2.1.2 dbplyr_2.1.0
# [28] png_0.1-7 pheatmap_1.0.12 uwot_0.1.10
# [31] sctransform_0.3.2 spatstat.sparse_2.0-0 compiler_4.0.5
# [34] httr_1.4.2 backports_1.2.1 assertthat_0.2.1
# [37] fastmap_1.1.0 lazyeval_0.2.2 limma_3.46.0
# [40] cli_3.3.0 later_1.1.0.1 htmltools_0.5.2
# [43] tools_4.0.5 igraph_1.2.6 gtable_0.3.0
# [46] glue_1.6.2 RANN_2.6.1 reshape2_1.4.4
# [49] Rcpp_1.0.6 scattermore_0.7 jquerylib_0.1.3
# [52] cellranger_1.1.0 vctrs_0.4.1 nlme_3.1-152
# [55] crosstalk_1.1.1 lmtest_0.9-38 globals_0.14.0
# [58] rvest_0.3.6 mime_0.10 miniUI_0.1.1.1
# [61] lifecycle_1.0.0 irlba_2.3.3 goftest_1.2-2
# [64] FSA_0.9.0 future_1.21.0 MASS_7.3-53.1
# [67] zoo_1.8-8 spatstat.core_2.0-0 hms_1.0.0
# [70] promises_1.2.0.1 spatstat.utils_2.3-0 parallel_4.0.5
# [73] RColorBrewer_1.1-2 yaml_2.2.1 reticulate_1.18
# [76] pbapply_1.4-3 gridExtra_2.3 sass_0.4.0
# [79] rpart_4.1-15 stringi_1.5.3 rlang_1.0.2
# [82] pkgconfig_2.0.3 matrixStats_0.58.0 lattice_0.20-41
# [85] ROCR_1.0-11 tensor_1.5 labeling_0.4.2
# [88] patchwork_1.1.1 htmlwidgets_1.5.3 cowplot_1.1.1
# [91] tidyselect_1.1.0 parallelly_1.23.0 RcppAnnoy_0.0.18
# [94] plyr_1.8.6 R6_2.5.0 generics_0.1.2
# [97] DBI_1.1.1 withr_2.4.3 pillar_1.4.7
# [100] haven_2.3.1 mgcv_1.8-34 fitdistrplus_1.1-3
# [103] survival_3.2-10 abind_1.4-5 future.apply_1.7.0
# [106] modelr_0.1.8 crayon_1.4.1 KernSmooth_2.23-18
# [109] spatstat.geom_2.4-0 viridis_0.5.1 grid_4.0.5
# [112] readxl_1.3.1 data.table_1.14.0 reprex_1.0.0
# [115] digest_0.6.27 xtable_1.8-4 httpuv_1.5.5
# [118] munsell_0.5.0 viridisLite_0.3.0 bslib_0.3.1
Test baseline high responder cell phenotypes in the same subsets
before and after mRNA vaccination with mixed model.
mid_res/mrna/mrna_3_baseline.sig.test.mono.r
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
#source(file = here('functions/scglmmr.functions.R'))
suppressMessages(library(emmeans))
source('functions/MattPMutils.r')
# set paths
= file.path(here('mid_res/mrna/generated_data/'))
datapath = file.path(here('mid_res/mrna/figures/'))
figpath
# load baseline monocyte leadingedge index unique genes
= readRDS(file = here('mid_res/baseline_response/dataV3/g0.sub.rds'))
gs0 = LeadingEdgeIndexed(gsea.result.list = gs0, padj.threshold = 0.05)
li0 = li0$CD14_Mono
li0
# define ifn sigs
= list('sig' = unique(unlist(li0)))
sig.test
# save combined signature genes (e2k)
= sig.test$sig
sig.genes ::fwrite(list(sig.genes),file = paste0(datapath,'sig.txt'), sep = '\t')
data.table
# load monocyte gated CITE-seq data from pfizer data
= readRDS('mid_res/mrna/generated_data/s.mono.rds')
s.mono = NormalizeData(s.mono,assay = 'RNA',normalization.method = 'LogNormalize')
s.mono # define umi matrix and metadata
= s.mono@assays$RNA@data
umi = s.mono@meta.data
md # format metadata for lme4
$time = factor(md$day,levels = c('0', '1', '21', '22'))
md$pt_id = factor(as.character(md$pt_id))
md
# module score simple average for the 3 signatures defined above.
= WeightedCellModuleScore(gene_matrix = umi,
mscore module_list = sig.test,
threshold = 0,
cellwise_scaling = FALSE,
return_weighted = FALSE)
# combine signature scores with meta.data
= cbind(mscore, md)
dat.fit saveRDS(dat.fit, file = paste0(datapath, 'dat.fit.mono.rds'))
= readRDS(file = here('mid_res/mrna/generated_data/dat.fit.mono.rds'))
dat.fit
# note strucure with one major outlier in cell number in this dataset:
table(dat.fit$time, dat.fit$sample_id) %>% t()
# lmer formula for the 3 signatures
= 'sig ~ 0 + time + (1|pt_id)'
f1
library(emmeans)
# fit model for each signature
= lme4::lmer(formula = f1,data = dat.fit)
m1 = emmeans::emmeans(m1,specs = ~time, lmer.df = 'asymptotic')
emm1
# contrast time differences
= levels(dat.fit$time)
clevels
#make custom contrasts
= c(1, 0, 0 ,0)
c0 = c(0, 1, 0 ,0)
c1 = c(0, 0, 1 ,0)
c3 = c(0, 0, 0 ,1)
c4 = list( "time1vs0" = c1 - c0, 'time22vs21' = c4 - c3 )
contrast_list = list( 'sig' = emm1 )
clist
=
c.res lapply(clist, function(x) {
::contrast(object = x, method = contrast_list) %>%
emmeans::tidy()
broom%>%
} ) bind_rows(.id = 'signature')
::fwrite(x = c.res, file = paste0(datapath,'c.res.mono.txt'), sep = '\t')
data.table
# delta contrast
= emmeans::contrast(object = emm1, method = contrast_list)
cmat pairs(cmat, reverse = TRUE)
# contrast estimate SE df z.ratio p.value
# time22vs21 - time1vs0 0.0967 0.00642 Inf 15.065 <.0001
cmat# contrast estimate SE df z.ratio p.value
# time1vs0 0.0846 0.00492 Inf 17.203 <.0001
# time22vs21 0.1813 0.00414 Inf 43.835 <.0001
# plotsingle cell distributionn and emmeans contrasts
= list(theme_bw(),
em_aes coord_flip(),
theme(axis.title.y = element_text(size = 7),
axis.text.y = element_text(size = 7)),
scale_color_manual('grey')
)
= list(theme_bw(), ylab(label ='Baseline high responder\nCD14 Mono signature'))
plot.aes
= sapply(c('grey', '#e2a359', 'grey', '#e2a359'), col.alpha, 0.8) %>% unname()
cu # combined signature change emm in p1 and change y value in p0
= ggplot(dat.fit, aes(x = time, y = sig, fill = time)) +
p0 geom_violin(show.legend = F) +
+
plot.aes xlab('time') +
scale_fill_manual(values = cu)+
theme(axis.title.x = element_text(size = 12))
ggsave(p0, filename = paste0(figpath, 'sig.pdf'), width = 2.5, height = 3)
= plot(emm1) + em_aes
p1 ggsave(p1, filename = paste0(figpath, 'sig.emm.pdf'), width = 1, height = 3)
Run same test in DCs
mid_res/mrna/mrna_3_signature.test.mdc.r
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(here))
suppressMessages(library(scglmmr))
suppressMessages(library(emmeans))
#source(file = here('functions/scglmmr.functions.R'))
source('functions/MattPMutils.r')
# set save paths
= file.path(here('mid_res/mrna/generated_data/'))
datapath = file.path(here('mid_res/mrna/figures/'))
figpath
# load baseline monocyte leadingedge index unique genes
= readRDS(file = here('mid_res/baseline_response/dataV3/g0.sub.rds'))
gs0 = LeadingEdgeIndexed(gsea.result.list = gs0, padj.threshold = 0.05)
li0 = li0$mDC
li0
# define sigs
= list('sig' = unique(unlist(li0)))
sig.test
# load monocyte gated CITE-seq data from pfizer data
= readRDS('mid_res/mrna/generated_data/s.mdc.rds')
s.mdc = NormalizeData(s.mdc,assay = 'RNA', normalization.method = 'LogNormalize')
s.mdc # define umi matrix and metadata
= s.mdc@assays$RNA@data
umi = s.mdc@meta.data
md # format metadata for lme4
$time = factor(md$day,levels = c('0', '1', '21', '22'))
md$pt_id = factor(as.character(md$pt_id))
md
# module score for the 3 signatures defined above.
= WeightedCellModuleScore(gene_matrix = umi,
mscore module_list = sig.test,
threshold = 0,
cellwise_scaling = FALSE,
return_weighted = FALSE)
# combine signature scores with meta.data
= cbind(mscore, md)
dat.fit saveRDS(dat.fit, file = paste0(datapath, 'dat.fit.mdc.rds'))
= readRDS(file = here('mid_res/mrna/generated_data/dat.fit.mdc.rds'))
dat.fit
# note strucure not as bad on outlier sample as mono
table(dat.fit$time, dat.fit$sample_id) %>% t()
# lmer formula for the 3 signatures
= 'sig ~ 0 + time + (1|pt_id)'
f1
library(emmeans)
# fit model for each signature
= lme4::lmer(formula = f1, data = dat.fit)
m1 = emmeans::emmeans(m1, specs = ~time, lmer.df = 'asymptotic')
emm1
# contrast time differences
= levels(dat.fit$time)
clevels
#make custom contrasts
= c(1, 0, 0 ,0)
c0 = c(0, 1, 0 ,0)
c1 = c(0, 0, 1 ,0)
c3 = c(0, 0, 0 ,1)
c4 = list( "time1vs0" = c1 - c0,
contrast_list 'time22vs21' = c4 - c3)
= list('sig' = emm1)
clist
=
c.res lapply(clist, function(x) {
::contrast(object = x, method = contrast_list) %>%
emmeans::tidy()
broom%>%
} ) bind_rows(.id = 'signature')
::fwrite(x = c.res, file = paste0(datapath,'c.res.mDC.txt'), sep = '\t')
data.table
# delta contrast
= emmeans::contrast(object = emm1, method = contrast_list)
cmat pairs(cmat, reverse = TRUE)
# contrast estimate SE df z.ratio p.value
# time22vs21 - time1vs0 0.0557 0.0165 Inf 3.366 0.0008
cmat# contrast estimate SE df z.ratio p.value
# time1vs0 0.0755 0.0118 Inf 6.422 <.0001
# time22vs21 0.1312 0.0116 Inf 11.284 <.0001
# plotsingle cell distributionn and emmeans contrasts
= list(theme_bw(),
em_aes coord_flip(),
theme(axis.title.y = element_text(size = 7),
axis.text.y = element_text(size = 7)),
scale_color_manual('grey')
)
= list(theme_bw(), ylab(label ='Baseline high responder\nmDC signature'))
plot.aes = sapply(c('grey', '#e2a359', 'grey', '#e2a359'), col.alpha, 0.8) %>% unname()
cu
= ggplot(dat.fit, aes(x = time, y = sig, fill = time)) +
p0 geom_violin(show.legend = F) +
+
plot.aes xlab('time') +
scale_fill_manual(values = cu)+
theme(axis.title.x = element_text(size = 12))
ggsave(p0, filename = paste0(figpath, 'sig_mDC.pdf'), width = 2.5, height = 3)
= plot(emm1) + em_aes
p1 ggsave(p1, filename = paste0(figpath, 'sig_mDC.emm.pdf'), width = 1, height = 3)
Aggregate log cpm in high and low responders at baseline
mid_res/nat_adj/1.calc.lcpm.baselineH1_as03_modelgenes.r
# R version 4.0.5
suppressMessages(library(here))
suppressMessages(library(tidyverse))
suppressMessages(library(scglmmr))
source(here('functions/MattPMutils.r'))
# set save paths
= here("mid_res/nat_adj/figures/V4/"); dir.create(figpath)
figpath = here("mid_res/nat_adj/generated_data//V4/"); dir.create(datapath)
datapath
# define high responders
= c("205","207","209","212","215","234","237","245","250","256")
high.responders
# read pb data, subset to day 0 non adj, subset out day 0 metadata.
= readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
pb = lapply(pb, function(x) x = x[ ,1:40])
pb = gsub("~.*","",colnames(pb[[1]]))
cnames = lapply(pb, function(x){
pb %>% as.data.frame() %>% setNames(nm = cnames) %>% as.matrix()
x
})= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV3/d0.rds'))
d0 = lapply(pb, function(x){ x = x[ , rownames(d0)]})
d0d
# make a list of genes indxed by celltype for genes to fit from H5 model
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/git_ignore/av_tidy.rds'))
av_tidy = lapply(av_tidy , function(x) unique(x$gene))
genes.test
# check names are the same of indexed genes and average data
stopifnot(isTRUE(all.equal(names(d0d), names(genes.test))))
# calcualte log CPM of the baseline pseudobulk data
= list()
av for (i in 1:length(d0d)) {
= edgeR::DGEList( d0d[[i]] )
dge = dge[genes.test[[i]], ]
dge = edgeR::cpm(dge, log = TRUE)
av[[i]]
}names(av) = names(d0d)
saveRDS(av,file = paste0(datapath,'av.rds'))
# tidy aggregated data
= list()
av0 for (i in 1:length(pb)) {
= names(av)[i]
ct = rownames(av[[i]])
gs = GetTidySummary(
av0[[i]] av.exprs.list = av,
celltype.index = i,
genes.use = gs) %>%
mutate(response = if_else(str_sub(sample, 1, 3) %in% high.responders, 'High', "Low")) %>%
mutate(response = factor(response, levels = c('Low', 'High')))
}names(av0) = names(pb)
saveRDS(av0, file = paste0(datapath,'av0.rds'))
Validate AS03 specificity of combined signatures in external AS03
cohort.
mid_res/nat_adj/2.AS03.innatesigs.vand.validationcohort.r
suppressMessages(library(here))
suppressMessages(library(tidyverse))
source(here('functions/scglmmr.functions.R'))
suppressMessages(library(magrittr))
source(here('functions/MattPMutils.r'))
# set save paths
= here("mid_res/nat_adj/figures/V4/")
figpath = here("mid_res/nat_adj/generated_data//V4/")
datapath
# load mono and mDC AS03 specific signatures
= readRDS(file = here('mid_res/combined_contrast/generated_data/mo.noifn.rds'))
mono.genes = readRDS(file = here('mid_res/combined_contrast/generated_data/dc.noifn.rds'))
mdc.genes
# extract leading edge into signatures
= mono.genes$leadingEdge %>% unlist() %>% unique()
mono.genes = mdc.genes$leadingEdge %>% unlist() %>% unique()
mdc.genes = c(mono.genes, mdc.genes) %>% unique()
li.full = list('AS03_signature' = li.full, "AS03_Monocyte" = mono.genes, 'AS03_mDC' = mdc.genes)
as03.sig = as03.sig
as03.sig.list saveRDS(as03.sig.list, file = paste0(datapath, 'as03.sig.list.rds'))
# validation cohort for the highlighted signatures
= readRDS(file = here('mid_res/vand/generated_data/fit1.rds'))
vand.fit = ExtractResult(model.fit.list = vand.fit,what = 'lmer.z.ranks', coefficient.number = 1, coef.name = 'delta')
vand.rank
# Enrichment of AS03 signatures without ifn signatures
= FgseaList(rank.list.celltype = vand.rank, pathways = as03.sig)
gvand saveRDS(gvand,file = paste0(datapath,'gvand.rds'))
$MNC
gvand# pathway pval padj log2err ES NES size leadingEdge celltype
# 1: AS03_signature 1.910373e-34 5.731119e-34 1.529705 0.5714156 3.144152 263 SERPINA1,SECTM1,HCK,PLSCR1,LILRA1,FCGR1B,... MNC
# 2: AS03_Monocyte 4.563863e-26 6.845794e-26 1.326716 0.5979691 3.088277 170 SERPINA1,SECTM1,HCK,PLSCR1,LILRA1,FCGR2A,... MNC
# 3: AS03_mDC 6.087321e-21 6.087321e-21 1.186651 0.5916694 2.984150 145 SERPINA1,SECTM1,HCK,LILRA1,FCGR1B,FCGR2A,... MNC
$DNC
gvand# pathway pval padj log2err ES NES size leadingEdge celltype
# 1: AS03_signature 4.154660e-17 1.246398e-16 1.0672100 0.4932650 2.402933 257 PSME2,FPR1,SLC31A2,PSMB10,PSME1,ALDH1A1,... DNC
# 2: AS03_mDC 1.761296e-12 2.641944e-12 0.9101197 0.5311486 2.420236 146 PSME2,FPR1,SLC31A2,PSMB10,PSME1,ALDH1A1,... DNC
# 3: AS03_Monocyte 6.171525e-10 6.171525e-10 0.8012156 0.4774659 2.201396 163 FPR1,SLC31A2,DPYD,IFIH1,LILRB3,PLSCR1,... DNC
# leading edge from each innate cell types - AS03 signature
= gvand$DNC %>% filter(pathway == 'AS03_mDC') %$% leadingEdge %>% unlist()
dc.as03.sig.validated = gvand$MNC %>% filter(pathway == 'AS03_Monocyte') %$% leadingEdge %>% unlist()
mono.as03.sig.validated saveRDS(dc.as03.sig.validated, file = paste0(datapath,'dc.as03.sig.validated.rds'))
saveRDS(mono.as03.sig.validated, file = paste0(datapath,'mono.as03.sig.validated.rds'))
::fwrite(list(dc.as03.sig.validated),file = paste0(datapath,'dc.as03.sig.validated.txt'),sep = '\t')
data.table::fwrite(list(mono.as03.sig.validated),file = paste0(datapath,'mono.as03.sig.validated.txt'),sep = '\t')
data.table
# plot enrichment distributions
= list(geom_line(color = "deepskyblue3", size = 2 ))
enrline #mono
= fgsea::plotEnrichment(pathway = as03.sig$AS03_Monocyte, stats = vand.rank$MNC) + enrline
p ggsave(p, filename = paste0(figpath, 'mono.vand.enr.2.pdf'), width = 5, height = 3)
# mDC
= fgsea::plotEnrichment(pathway = as03.sig$AS03_mDC, stats = vand.rank$DNC) + enrline
p ggsave(p, filename = paste0(figpath, 'dc.vand.enr.2.pdf'), width = 5, height = 3)
Test AS03 specific mDC and monocyte phenotypes in high vs low
responders at baseline.
mid_res/nat_adj/3.natural.adjuvant.signatures.r
suppressMessages(library(here))
suppressMessages(library(tidyverse))
source(here('functions/scglmmr.functions.R'))
source(here('functions/MattPMutils.r'))
set.seed(1990)
# set save paths
= here("mid_res/nat_adj/figures/V4/")
figpath = here("mid_res/nat_adj/generated_data/V4/")
datapath
# set theme
= c("grey48", "grey", "grey48", "deepskyblue3")
cu = sapply(cu, col.alpha, alpha = 0.4) %>% unname()
cu.alpha = list(
mtheme geom_boxplot(show.legend = FALSE, outlier.shape = NA),
theme_bw(base_size = 10.5),
theme(axis.text.x=element_text(angle = -90, hjust = 0)),
theme(strip.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.text = element_text(face = "bold",family = "Helvetica"),
axis.text.y = element_text(size = 6),
axis.title.y = element_text(size = 10))
)= sapply(c('dodgerblue', 'red'), col.alpha, 0.2) %>% unname()
cua
# load average day 1 comparison cohort data
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gene_dist/av_tidy.rds'))
av_tidy
# AS03 adjuvant signatures (no ifn)
= readRDS(file = here('mid_res/nat_adj/generated_data/V4/as03.sig.list.rds'))
as03.sig.list
# mdc Combined AS03 signature average across time between groups
=
mdc.sig.av $mDC %>%
av_tidyfilter(gene %in% as03.sig.list$AS03_mDC) %>%
group_by(sample, group) %>%
summarize(meansig = mean(count))
#plot
= ggplot(mdc.sig.av, aes(x = group, y = meansig, fill = group , color = group)) +
p +
mtheme theme(axis.title.x = element_blank()) +
ylab('mDC AS03 Adjuvant Signature') +
scale_color_manual(values = cu) +
scale_fill_manual(values = cu.alpha) +
ggtitle('mDC')
ggsave(p,filename = paste0(figpath, 'as03_mDC_sig.2.pdf'), width = 1.9, height = 3)
# monocyte Combined AS03 signature average across time between groups
=
mono.sig.av $CD14_Mono %>%
av_tidyfilter(gene %in% as03.sig.list$AS03_Monocyte) %>%
group_by(sample, group) %>%
summarize(meansig = mean(count))
#plot
= ggplot(mono.sig.av, aes(x = group, y = meansig, fill = group , color = group)) +
p +
mtheme theme(axis.title.x = element_blank()) +
ylab('CD14 Mono AS03 Adjuvant Signature') +
scale_color_manual(values = cu) +
scale_fill_manual(values = cu.alpha) +
ggtitle('CD14 Monocytes')
ggsave(p,filename = paste0(figpath, 'as03_mono_sig.2.pdf'), width = 1.9, height = 3)
##########################
## GSEA of NA signatures
##########################
# enrichment of validated signatures (Vand cohort) in baseline high vs low responders
= readRDS(file = here('mid_res/nat_adj/generated_data/V4/mono.as03.sig.validated.rds'))
mono.as03.sig.validated = readRDS(file = here('mid_res/nat_adj/generated_data/V4/dc.as03.sig.validated.rds'))
dc.as03.sig.validated
# high vs low model gene ranks within mono and mDC age and sex adjusted
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/cont0.rds'))
cont0 = ExtractResult(model.fit.list = cont0, what = 'gene.t.ranks',coefficient.number = 1, coef.name = 'adjmfc')
r0
# enrichment
= fgsea::fgsea(pathways = list('AS03.mono' = mono.as03.sig.validated), stats = r0$CD14_Mono)
mono.na.gsea # pathway pval padj log2err ES NES size leadingEdge
# 1: AS03.mono 7.515986e-13 7.515986e-13 0.921426 0.6263153 2.74048 78 S100A11,S100A12,APOBEC3A,DDX60,DDX58,CREB5,...
= fgsea::plotEnrichment(pathway = mono.as03.sig.validated, stats = r0$CD14_Mono) +
p geom_line(size = 1.5, color = 'red') +
theme(plot.title = element_text(size = 10))
pggsave(p, filename =paste0(figpath, 'mono.natadj.gsea.2.pdf'), width = 5, height = 3)
$leadingEdge
mono.na.gsea# [1] "S100A11" "S100A12" "APOBEC3A" "DDX60" "DDX58" "CREB5" "TFEC" "S100A9" "GCA" "FGL2" "NACC2"
# [12] "KYNU" "SLC16A3" "MS4A4A" "S100A8" "FCGR3A" "SAMHD1" "TNFSF13B" "C19orf59" "CCR2" "MARCO" "P2RY13"
# [23] "RSAD2" "SERPINA1" "FPR1" "FGR" "PLSCR1" "SIGLEC9" "IFIH1" "ACSL1" "LMNB1" "LRRK2" "MNDA"
# [34] "PLBD1" "KIAA0513" "AQP9" "SLC31A2" "LILRB1" "VCAN"
::fwrite(mono.na.gsea$leadingEdge,file = paste0(datapath,'mono.na.gsea.leadingEdge.txt'))
data.table
= fgsea::fgsea(pathways = list('AS03.mdc' = dc.as03.sig.validated), stats = r0$mDC)
mdc.na.gsea # pathway pval padj log2err ES NES size leadingEdge
# 1: AS03.mdc 0.02383525 0.02383525 0.3524879 0.3713362 1.521304 58 S100A8,S100A9,PSMB6,SERPINA1,RB1,SLC31A2,...
= fgsea::plotEnrichment(pathway = dc.as03.sig.validated,stats = r0$mDC) +
p geom_line(size = 1.5, color = 'red') +
theme(plot.title = element_text(size = 10))
pggsave(p, filename =paste0(figpath, 'mdc.natadj.gsea.pdf'), width = 5, height = 3)
$leadingEdge
mdc.na.gsea# [1] "S100A8" "S100A9" "PSMB6" "SERPINA1" "RB1" "SLC31A2" "TYMP" "S100A11" "MS4A4A" "FCN1" "LILRB2"
# [12] "PSMA7" "CDC26" "RBX1" "PSMB3" "PLBD1" "LMNB1" "PPP2R5E" "KYNU"
::fwrite(mdc.na.gsea$leadingEdge,file = paste0(datapath,'mdc.na.gsea.leadingEdge.txt')) data.table
Analyze flow cytometry data for differences at baseline in high and
low responders. Only test innate cell subsets to focus on hypothesis
generated from analysis of CITE-seq data. Further test the identified
activated monocyte phenotype for its longitudinal kinetics and day 1 vs
baseline fold change difference in high and low responders.
mid_res/flow_kinetic/flow_cellfreq_kinetics.r
# flow activated monocyte kinetic
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(emmeans))
source(here('functions/MattPMutils.r'))
# save path
= here('mid_res/flow_kinetic/figures/');
figpath dir.create(figpath,recursive = TRUE)
# test only innate subsets based on hypothesis generated from CITE-seq data
# load flow data
= data.table::fread('data/CHI_H1N1_data/flow/flow_annotation.txt', header = TRUE)
fp = paste('ID', 64:78,sep = '') %>% as.character()
id.select = fp %>% filter(`Population ID` %in% id.select)
fp
# load flow data day 1 fold changes
=
fd ::fread(file = here('data/CHI_H1N1_data/flow/day1-day0.log10.txt'),header = TRUE) %>%
data.tablefilter(ID %in% fp$`Population ID`) %>%
mutate(subset_name = plyr::mapvalues(ID, from = fp$`Population ID`,to = fp$`Subset name`)) %>%
select(-ID) %>%
column_to_rownames('subset_name') %>%
t() %>%
as.data.frame() %>%
rownames_to_column('Subject')
# map titers to data
= data.table::fread(file = here('data/CHI_H1N1_data/titer/titer_processed.txt'))
titer $adjMFC_class = plyr::mapvalues(x = fd$Subject, from = titer$Subject,to = titer$adjMFC_class )
fd= fd %>% select(adjMFC_class, everything())
fd $adjMFC_class = factor(fd$adjMFC_class, levels = c('0','1','2'))
fd= fd[!is.na(fd$adjMFC_class), ]
fd
# baseline
# these are raw percentages so use non parametric rank stats
=
fd3 ::fread(file = here('data/CHI_H1N1_data/flow/day0.raw.txt'),header = TRUE) %>%
data.tablefilter(ID %in% fp$`Population ID`) %>%
mutate(subset_name = plyr::mapvalues(ID, from = fp$`Population ID`,to = fp$`Subset name`)) %>%
select(-ID) %>%
column_to_rownames('subset_name') %>%
t() %>%
as.data.frame() %>%
rownames_to_column('Subject')
# map adj mfc class
$adjMFC_class = plyr::mapvalues(x = fd3$Subject, from = titer$Subject,to = titer$adjMFC_class )
fd3= fd3 %>% select(adjMFC_class, everything())
fd3 $adjMFC_class = factor(fd3$adjMFC_class, levels = c('0','1','2'))
fd3= fd3[!is.na(fd3$adjMFC_class), ]
fd3 = fd3[!fd3$adjMFC_class == '1', ]
fd3
= apply(
wilcox.res3 X = fd3[, 3:ncol(fd3)],
MARGIN = 2,
FUN = function(x) {
wilcox.test(x ~ fd3$adjMFC_class) %>% broom::tidy()
%>%
}) bind_rows(.id = 'subset')
%>% filter(p.value < 0.1)
wilcox.res3 saveRDS(wilcox.res3,file = paste0(datapath,'wilcox.res3.rds'))
# comparison
= list(c('2','0'))
flow_compare
# color specification
= sapply(c('dodgerblue', 'red'), col.alpha, 0.2) %>% unname()
cu1 = c('dodgerblue', 'red')
cu2
# theme
= list(
mtheme theme_bw(),
geom_boxplot(show.legend = FALSE, outlier.shape = 21),
::stat_compare_means(comparisons = flow_compare,method = 'wilcox', paired = FALSE),
ggpubrscale_y_continuous( breaks= scales::pretty_breaks(), expand = c(0.15,0)),
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
axis.text.x = element_text(size = 10, color = 'black')),
scale_x_discrete(labels = c("low", "high")),
xlab("Antibody \n Response"),
scale_fill_manual(values = cu1),
scale_color_manual(values = cu2)
)
# plot
= ggplot(fd3, aes(x = adjMFC_class, y = `activated monocyte HLA-DR+`,
p3 color = adjMFC_class,
fill = adjMFC_class)) +
mthemeggsave(p3,filename = paste0(figpath, 'mono_HLADR.pdf'), width = 2, height = 3)
# longitudinal analysis of cell population frequency
# connection between baseline (-7 and 0) and day 1 kinetics
= data.table::fread(file = here('data/CHI_H1N1_data/flow/longitudinal/pre7.raw.txt'), header = TRUE)
d01 = data.table::fread(file = here('data/CHI_H1N1_data/flow/longitudinal/day0.raw.txt'), header = TRUE)
d02 = data.table::fread(file = here('data/CHI_H1N1_data/flow/longitudinal/day1.raw.txt'), header = TRUE)
d1 = data.table::fread(file = here('data/CHI_H1N1_data/flow/longitudinal/day7.raw.txt'), header = TRUE)
d7 = data.table::fread(file = here('data/CHI_H1N1_data/flow/longitudinal/day70.raw.txt'), header = TRUE)
d70
= list('t01' = d01, 't02' = d02, 't1' = d1, 't7' = d7, 't70' = d70 )
d.list
# format, combine and label by time
= lapply(d.list, function(x)
d %>%
x filter(ID %in% fp$`Population ID`) %>%
mutate(subset_name = plyr::mapvalues(ID, from = fp$`Population ID`,to = fp$`Subset name`)) %>%
select(-ID) %>%
column_to_rownames('subset_name') %>%
t() %>%
as.data.frame() %>%
rownames_to_column('Subject') ) %>%
bind_rows(.id = 'timepoint')
# map adj mfc class
$adjMFC_class = plyr::mapvalues(x = d$Subject, from = titer$Subject,to = titer$adjMFC_class )
d= d %>% filter(adjMFC_class %in% c('0','2'))
d $adjMFC_class = factor(d$adjMFC_class, levels = c('0','2'))
d$timepoint = factor(d$timepoint, levels = c("t01", "t02", "t1", "t7", "t70"))
d
# plot time course
= list(
mtheme2 theme_bw(),
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
axis.text.x = element_text(size = 10, color = 'black')),
scale_fill_manual(values = cu1),
scale_color_manual(values = cu2)
)
# monocyte
= d[!is.na(d$`activated monocyte HLA-DR+`), ]
d2 =
p3ggplot(d2, aes(x = timepoint, y = `activated monocyte HLA-DR+`, color = adjMFC_class, group = Subject)) +
geom_line(size = 0.5, alpha = 0.2, show.legend = FALSE) +
geom_point(size = 0.5, alpha = 0.2, shape = 21, show.legend = FALSE) +
geom_smooth(data = d2, size = 2.5,
method = 'loess',
aes(x = timepoint, y = `activated monocyte HLA-DR+`,
color = adjMFC_class,
fill = adjMFC_class,
group = adjMFC_class), alpha = 0.2,
se = TRUE, show.legend = FALSE) +
scale_x_discrete(expand = c(0,0.1)) +
mtheme2
p3ggsave(p3,filename = paste0(figpath, 'monoDRfreqtime.pdf'), width = 3, height = 3)
### mixed model
= d %>%
d2 select(timepoint, subjectid = Subject,
drmono = `activated monocyte HLA-DR+`,
adjmfc = adjMFC_class) %>%
mutate(timepoint= as.character(timepoint)) %>%
mutate(timepoint = ifelse(timepoint %in% c('t01','t02'),yes = 't0',no = timepoint)) %>%
mutate(timepoint = factor(timepoint , levels = c( "t0", "t1" , "t7" ,"t70")))
# fit model
= lme4::lmer(drmono ~ timepoint + (1|subjectid),data = d2)
m = lme4::lmer(drmono ~ timepoint*adjmfc + (1|subjectid),data = d2)
m2 anova(m,m2)
# Data: d2
# Models:
# m: drmono ~ timepoint + (1 | subjectid)
# m2: drmono ~ timepoint * adjmfc + (1 | subjectid)
# npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
# m 6 615.95 632.58 -301.98 603.95
# m2 10 615.02 642.73 -297.51 595.02 8.9311 4 0.06284 .
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
= emmeans(m2, revpairwise ~ timepoint|adjmfc)
emm1 = emm1$contrasts[c(1,7), ] %>% plot() + theme_bw() +
p xlab('effect size') +
ylab('group') +
theme(text = element_text(size = 5))
pggsave(p,filename = paste0(figpath, 'monoDREMM.pdf'), width = 3, height = 1)
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] emmeans_1.5.4 scglmmr_0.1.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
# [9] tibble_3.1.8 ggplot2_3.3.3 tidyverse_1.3.0 here_1.0.1
#
# loaded via a namespace (and not attached):
# [1] utf8_1.2.2 tidyselect_1.2.0 lme4_1.1-26 htmlwidgets_1.5.3
# [5] RSQLite_2.2.7 AnnotationDbi_1.52.0 grid_4.0.5 BiocParallel_1.24.1
# [9] scatterpie_0.1.7 munsell_0.5.0 codetools_0.2-18 statmod_1.4.35
# [13] withr_2.4.3 colorspace_2.0-0 GOSemSim_2.16.1 Biobase_2.50.0
# [17] knitr_1.39 rstudioapi_0.13 stats4_4.0.5 ggsignif_0.6.0
# [21] DOSE_3.16.0 labeling_0.4.2 MatrixGenerics_1.2.1 Rdpack_2.1.1
# [25] GenomeInfoDbData_1.2.4 polyclip_1.10-0 bit64_4.0.5 farver_2.0.3
# [29] pheatmap_1.0.12 rprojroot_2.0.2 downloader_0.4 coda_0.19-4
# [33] vctrs_0.5.1 generics_0.1.2 TH.data_1.0-10 xfun_0.30
# [37] R6_2.5.0 doParallel_1.0.16 GenomeInfoDb_1.26.7 graphlayouts_0.7.2
# [41] locfit_1.5-9.4 bitops_1.0-6 cachem_1.0.4 fgsea_1.16.0
# [45] DelayedArray_0.16.3 assertthat_0.2.1 scales_1.1.1 nnet_7.3-15
# [49] multcomp_1.4-16 ggraph_2.0.5 enrichplot_1.10.2 gtable_0.3.0
# [53] egg_0.4.5 tidygraph_1.2.0 sandwich_3.0-0 rlang_1.0.6
# [57] slanter_0.2-0 splines_4.0.5 rstatix_0.7.0 checkmate_2.0.0
# [61] broom_0.7.5 BiocManager_1.30.10 reshape2_1.4.4 abind_1.4-5
# [65] modelr_0.1.8 backports_1.2.1 Hmisc_4.5-0 qvalue_2.22.0
# [69] clusterProfiler_3.18.1 tools_4.0.5 ellipsis_0.3.2 gplots_3.1.1
# [73] RColorBrewer_1.1-2 BiocGenerics_0.36.1 Rcpp_1.0.9 plyr_1.8.6
# [77] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.36.0 RCurl_1.98-1.3
# [81] prettyunits_1.1.1 rpart_4.1-15 ggpubr_0.4.0 viridis_0.5.1
# [85] cowplot_1.1.1 S4Vectors_0.28.1 zoo_1.8-8 cluster_2.1.2
# [89] SummarizedExperiment_1.20.0 haven_2.4.3 ggrepel_0.9.1 fs_1.5.0
# [93] variancePartition_1.25.6 magrittr_2.0.3 data.table_1.14.0 DO.db_2.9
# [97] openxlsx_4.2.3 reprex_1.0.0 mvtnorm_1.1-1 packrat_0.7.0
# [101] matrixStats_0.58.0 hms_1.0.0 GSVA_1.38.2 xtable_1.8-4
# [105] pbkrtest_0.5-0.1 RhpcBLASctl_0.21-247.1 XML_3.99-0.6 jpeg_0.1-8.1
# [109] rio_0.5.16 readxl_1.3.1 IRanges_2.24.1 gridExtra_2.3
# [113] compiler_4.0.5 KernSmooth_2.23-18 crayon_1.4.1 shadowtext_0.0.9
# [117] htmltools_0.5.2 minqa_1.2.4 mgcv_1.8-34 ggfun_0.0.4
# [121] Formula_1.2-4 lubridate_1.8.0 DBI_1.1.1 corrplot_0.84
# [125] tweenr_1.0.2 dbplyr_2.1.0 MASS_7.3-53.1 boot_1.3-27
# [129] Matrix_1.4-1 car_3.0-10 cli_3.4.1 rbibutils_2.0
# [133] parallel_4.0.5 igraph_1.2.6 GenomicRanges_1.42.0 pkgconfig_2.0.3
# [137] rvcheck_0.1.8 foreign_0.8-81 xml2_1.3.2 foreach_1.5.1
# [141] annotate_1.68.0 XVector_0.30.0 GeneOverlap_1.26.0 estimability_1.3
# [145] rvest_0.3.6 digest_0.6.27 graph_1.68.0 cellranger_1.1.0
# [149] fastmatch_1.1-0 htmlTable_2.1.0 edgeR_3.32.1 GSEABase_1.52.1
# [153] curl_4.3 gtools_3.8.2 nloptr_1.2.2.2 lifecycle_1.0.3
# [157] nlme_3.1-152 jsonlite_1.7.2 aod_1.3.1 carData_3.0-4
# [161] viridisLite_0.3.0 limma_3.46.0 fansi_0.4.2 pillar_1.8.1
# [165] lattice_0.20-41 fastmap_1.1.0 httr_1.4.2 survival_3.2-10
# [169] GO.db_3.12.1 glue_1.6.2 UpSetR_1.4.0 zip_2.1.1
# [173] png_0.1-7 iterators_1.0.13 bit_4.0.4 ggforce_0.3.3
# [177] stringi_1.5.3 blob_1.2.1 org.Hs.eg.db_3.12.0 latticeExtra_0.6-29
# [181] caTools_1.18.1 memoise_2.0.0
Visualize stimulated and unstimulated cells identified by
HDStim.
mid_res/stim/visualize_stim_cells.r
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(HDStIM))
library(Rcpp)
library(emmeans)
library(lme4)
source('functions/MattPMutils.r')
# save paths
= here('mid_res/stim/figures/')
figpath = here('mid_res/stim/generated_data/')
datapath dir.create(figpath); dir.create(datapath)
# read data from stim cell selector
= readRDS(file = here('data/stim/mapped_data.rds'))
d
$umap_plot_data
d
= HDStIM::plot_umap(mapped_data = d)
up = up[[4]]
pd = pd$data
pd2 $stim = ifelse(str_sub(pd2$response_status, -5,-1) == 'Stim.', yes = 'LPS stimulated', no = 'unstimulated')
pd2= ggsci::pal_d3( palette = 'category20', 0.5)(2) %>% rev
cf =
p ggplot(pd2, aes(x = UMAP1, y = UMAP2, fill = stim )) +
geom_point(shape = 21, stroke = 0.1) +
theme_bw() +
scale_fill_manual(values = cf) +
ggtitle(pd$labels$title)
ggsave(p,filename = paste0(figpath, 'umap_lps_mono.png'), width = 5, height = 4)
ggsave(p,filename = paste0(figpath, 'umap_lps_mono_outline.pdf'), width = 5, height = 4)
Fit mixed effects model of median phospho protein marker expression
in classical monocytes pre vs post stimulation and compare effects in
high vs low responders.
mid_res/cytof_stim/stim_test_ag.r
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(HDStIM))
suppressMessages(library(Rcpp))
suppressMessages(library(emmeans))
suppressMessages(library(lme4))
source('functions/MattPMutils.r')
# save paths
= here('mid_res/cytof_stim/figures/')
figpath = here('mid_res/cytof_stim/generated_data/')
datapath dir.create(figpath); dir.create(datapath)
# baseline bulk theme
= sapply(c('dodgerblue','red'), col.alpha, 0.2) %>% unname()
cu1 = c('dodgerblue', 'red')
cu2
# read data from stim cell selector
= readRDS(file = here('data/stim/mapped_data.rds'))
d
# celltype markers
= c('CD45', 'CD7', 'CD19', 'CD4', 'IgD', 'CD20',
cmarkers "CD11c", "CD127", 'CD25', 'CD123', 'CD27', 'CD24',
'CD14', 'CD56', 'CD16', 'CD38', 'CD8', 'CD45RA',
'CD3', 'HLA_DR')
# phenotyping markers
= c("pPLCg2", "pSTAT5", "AKT", "pSTAT1", "pP38",
pmarkers "pSTAT3", "IkBa", "pCREB", "pERK1_2", "pS6")
# innate cells
= c('CD14Mono', 'DC1', 'DC2')
innate.cells
########################
# make a data matrix
= d$response_mapping_main
dr $cell_id = paste(dr$sample_id, rownames(dr),sep = '_')
dr
= as.data.frame(dr) %>%
dr column_to_rownames('cell_id') %>%
mutate(sx = paste(patient_id, condition, stim_type, sep = '_')) %>%
mutate(response.stim = paste(condition, stim_type, sep = '_')) %>%
mutate(adjMFC = factor(condition,levels = c('high', 'low')))
# log transform the state marker matrix
= dr %>% select(all_of(pmarkers))
mat #mat = log(mat + 1 )
#make a separate dataframe of metadata
= c(cmarkers, pmarkers)
prots = dr %>% select(!all_of(prots))
met
# combine log transformed data back with md
stopifnot(isTRUE(all.equal(rownames(met), rownames(mat))))
= cbind(met, mat )
dr
# aggregate the protein markers to test the median log transformed marker intensity
= dr %>%
da group_by(sx, response.stim, patient_id, batch, cell_population, stim_type) %>%
summarize_at(.vars = pmarkers, .funs = median) %>%
filter(cell_population %in% c('CD14Mono', 'DC1', 'DC2', 'CD16Mono'))
# model
= as.formula(
f1 ~ 0 + batch + response.stim + ( 1 | patient_id)
prot
)
# separate by stim
= da %>%
lps filter(stim_type %in% c('U', 'L')) %>%
filter(cell_population == 'CD14Mono')
# modify group factor for contrast estimand
$response.stim = factor(
lps$response.stim,
lpslevels = c("high_U", "high_L", "low_U", "low_L")
)
= lps
dfit = c ( '(high_L - high_U) - (low_L - low_U)' )
contrast_sub = reslist = list()
emmeans for (i in 1:length(pmarkers)) {
# test
= pmarkers[i]
prot.name print(prot.name)
# # metadata
= dfit %>% select(!all_of(pmarkers))
met
# extract single protein
= data.frame(dfit[ ,prot.name])
dat_vec colnames(dat_vec) = 'prot'
# model data to fit
= base::data.frame(cbind(dat_vec, met))
dat_fit
# save quick plot
= dat_fit %>% filter(response.stim %in% c('low_L', 'high_L'))
dplot $response.stim = factor(dplot$response.stim, levels = c('low_L', 'high_L'))
dplot= ggplot(dplot, aes(x = response.stim,y = prot , color = response.stim, fill = response.stim)) +
p theme_bw() +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
#geom_jitter(show.legend = FALSE, width = 0.2, shape = 21, color = 'black', stroke = 0.3) +
ylab(prot.name) +
scale_fill_manual(values = cu1) +
scale_color_manual(values = cu2)
pggsave(p, filename = paste0(figpath,prot.name,'lps.pdf'), width = 1.75, height = 2)
# fit models
= tryCatch(
m1 ::lmer(formula = f1, data = dat_fit),
lme4error = function(e)
return(NA)
)
# emmeans to apply contrast of fold changes
= tryCatch(
emm1 ::emmeans(
emmeansobject = m1,
specs = ~ 'response.stim',
data = dat_fit,
lmer.df = "asymptotic"),
error = function(e) return(NA)
)
# apply contrasts
if (!is.na(emm1)) {
= contrast(emm1, method = 'revpairwise',adjust = NULL)
m1.cont = pairs(m1.cont,adjust = NULL) %>%
m1cont as.data.frame() %>%
filter( contrast == '(high_L - high_U) - (low_L - low_U)' )
# store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
# store contrast test
= m1cont %>%
tidy.fit mutate(prot = prot.name) %>%
select(prot, everything())
= tidy.fit
reslist[[i]] else{
} # store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
= data.frame(
reslist[[i]] prot = prot.name,
contrast = NA,
estimate = NA,
SE = NA,
df = NA,
z.ratio = NA,
p.value = NA
)
}
}= do.call(rbind,reslist)
rd $stim = 'LPS'
rd::fwrite(rd,file = paste0(datapath,'mono14_lps.txt'), sep = '\t')
data.table
######################
# PMA
= da %>%
pma filter(stim_type %in% c('U', 'P')) %>%
filter(cell_population == 'CD14Mono')
# modify group factor for contrast estimand
$response.stim = factor(
pma$response.stim,
pmalevels = c("high_U", "high_P", "low_U", "low_P")
)= pma
dfit = c ( '(high_P - high_U) - (low_P - low_U)' )
contrast_sub = reslist = list()
emmeans for (i in 1:length(pmarkers)) {
#i = 5
# test
= pmarkers[i]
prot.name print(prot.name)
# # metadata
= dfit %>% select(!all_of(pmarkers))
met
# extract single protein
= data.frame(dfit[ ,prot.name])
dat_vec colnames(dat_vec) = 'prot'
# model data to fit
= base::data.frame(cbind(dat_vec, met))
dat_fit
# save quick plot
= dat_fit %>% filter(response.stim %in% c('low_P', 'high_P'))
dplot $response.stim = factor(dplot$response.stim, levels = c('low_P', 'high_P'))
dplot= ggplot(dplot, aes(x = response.stim,y = prot , color = response.stim, fill = response.stim)) +
p theme_bw() +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(show.legend = FALSE, width = 0.2, shape = 21, color = 'black', stroke = 0.3) +
ylab(prot.name) +
scale_fill_manual(values = cu1) +
scale_color_manual(values = cu2)
pggsave(p, filename = paste0(figpath,prot.name,'pma.pdf'), width = 1.75, height = 2)
# fit models
= tryCatch(
m1 ::lmer(formula = f1, data = dat_fit),
lme4error = function(e)
return(NA)
)
# emmeans to apply contrast of fold changes
= tryCatch(
emm1 ::emmeans(
emmeansobject = m1,
specs = ~ 'response.stim',
data = dat_fit,
lmer.df = "asymptotic"
), error = function(e) return(NA)
)
# apply contrasts
if (!is.na(emm1)) {
= contrast(emm1, method = 'revpairwise',adjust = NULL)
m1.cont = pairs(m1.cont,adjust = NULL) %>%
m1cont as.data.frame() %>%
filter( contrast == '(high_P - high_U) - (low_P - low_U)' )
# store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
# store contrast test
= m1cont %>%
tidy.fit mutate(prot = prot.name) %>%
select(prot, everything())
= tidy.fit
reslist[[i]] else{
} # store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
= data.frame(
reslist[[i]] prot = prot.name,
contrast = NA,
estimate = NA,
SE = NA,
df = NA,
z.ratio = NA,
p.value = NA
)
}
}= do.call(rbind,reslist)
rd $stim = 'PMA'
rd::fwrite(rd,file = paste0(datapath,'mono14_PMA.txt'), sep = '\t')
data.table
# IFN
# separate by stim
= da %>%
ifn filter(stim_type %in% c('U', 'A')) %>%
filter(cell_population == 'CD14Mono')
# modify group factor for contrast estimand
$response.stim = factor(
ifn$response.stim,
ifnlevels = c("high_U", "high_A", "low_U", "low_A")
)= ifn
dfit = c ( '(high_A - high_U) - (low_A - low_U)' )
contrast_sub = reslist = list()
emmeans for (i in 1:length(pmarkers)) {
# test
= pmarkers[i]
prot.name print(prot.name)
# # metadata
= dfit %>% select(!all_of(pmarkers))
met
# extract single protein
= data.frame(dfit[ ,prot.name])
dat_vec colnames(dat_vec) = 'prot'
# model data to fit
= base::data.frame(cbind(dat_vec, met))
dat_fit
# save quick plot
= dat_fit %>% filter(response.stim %in% c('low_A', 'high_A'))
dplot $response.stim = factor(dplot$response.stim, levels = c('low_A', 'high_A'))
dplot= ggplot(dplot, aes(x = response.stim,y = prot , color = response.stim, fill = response.stim)) +
p theme_bw() +
geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
geom_jitter(show.legend = FALSE, width = 0.2, shape = 21, color = 'black', stroke = 0.3) +
ylab(prot.name) +
scale_fill_manual(values = cu1) +
scale_color_manual(values = cu2)
ggsave(p, filename = paste0(figpath,prot.name,'IFN.pdf'), width = 1.75, height = 2)
# fit models
= tryCatch(
m1 ::lmer(formula = f1, data = dat_fit),
lme4error = function(e)
return(NA)
)
# emmeans to apply contrast of fold changes
= tryCatch(
emm1 ::emmeans(
emmeansobject = m1,
specs = ~ 'response.stim',
data = dat_fit,
lmer.df = "asymptotic"
),error = function(e)
return(NA)
)
# apply contrasts
if (!is.na(emm1)) {
= contrast(emm1, method = 'revpairwise',adjust = NULL)
m1.cont = pairs(m1.cont,adjust = NULL) %>%
m1cont as.data.frame() %>%
filter( contrast == '(high_A - high_U) - (low_A - low_U)' )
# store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
# store contrast test
= m1cont %>%
tidy.fit mutate(prot = prot.name) %>%
select(prot, everything())
= tidy.fit
reslist[[i]] else{
} # store results
= emm1
emmeans[[i]] names(emmeans)[i] = prot.name
= data.frame(
reslist[[i]] prot = prot.name,
contrast = NA,
estimate = NA,
SE = NA,
df = NA,
z.ratio = NA,
p.value = NA
)
}
}= do.call(rbind,reslist)
rd $stim = 'IFN'
rd::fwrite(rd,file = paste0(datapath,'mono14_IFN.txt'), sep = '\t')
data.table
sessionInfo()
# R version 4.0.5 Patched (2021-03-31 r80136)
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] lme4_1.1-26 Matrix_1.4-1 emmeans_1.5.4 Rcpp_1.0.9 HDStIM_0.1.0
# [6] here_1.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.4 purrr_0.3.4
# [11] readr_1.4.0 tidyr_1.1.2 tibble_3.1.8 ggplot2_3.3.3 tidyverse_1.3.0
#
# loaded via a namespace (and not attached):
# [1] httr_1.4.2 jsonlite_1.7.2 splines_4.0.5 modelr_0.1.8 assertthat_0.2.1
# [6] statmod_1.4.35 cellranger_1.1.0 yaml_2.2.1 pillar_1.8.1 backports_1.2.1
# [11] lattice_0.20-41 glue_1.6.2 digest_0.6.27 rvest_0.3.6 minqa_1.2.4
# [16] colorspace_2.0-0 sandwich_3.0-0 htmltools_0.5.2 plyr_1.8.6 pkgconfig_2.0.3
# [21] broom_0.7.5 haven_2.4.3 xtable_1.8-4 mvtnorm_1.1-1 scales_1.1.1
# [26] farver_2.0.3 generics_0.1.2 ellipsis_0.3.2 TH.data_1.0-10 withr_2.4.3
# [31] Boruta_7.0.0 cli_3.4.1 survival_3.2-10 magrittr_2.0.3 crayon_1.4.1
# [36] readxl_1.3.1 evaluate_0.15 estimability_1.3 fs_1.5.0 fansi_0.4.2
# [41] nlme_3.1-152 MASS_7.3-53.1 xml2_1.3.2 rsconnect_0.8.25 tools_4.0.5
# [46] data.table_1.14.0 hms_1.0.0 lifecycle_1.0.3 multcomp_1.4-16 munsell_0.5.0
# [51] reprex_1.0.0 packrat_0.7.0 compiler_4.0.5 rlang_1.0.6 grid_4.0.5
# [56] nloptr_1.2.2.2 ggridges_0.5.3 rstudioapi_0.13 rmarkdown_2.9 labeling_0.4.2
# [61] boot_1.3-27 gtable_0.3.0 codetools_0.2-18 DBI_1.1.1 R6_2.5.0
# [66] zoo_1.8-8 lubridate_1.8.0 knitr_1.39 fastmap_1.1.0 uwot_0.1.10
# [71] utf8_1.2.2 rprojroot_2.0.2 stringi_1.5.3 parallel_4.0.5 vctrs_0.5.1
# [76] xfun_0.30 dbplyr_2.1.0 tidyselect_1.2.0 coda_0.19-4
Write results for supplementary tables.
mid_res/data_write/Final_script_wite_table_fsc.r
# make tables
suppressMessages(library(tidyverse))
suppressMessages(library(here))
suppressMessages(library(magrittr))
source(file = here('functions/scglmmr.functions.R'))
= file.path(here('mid_res/data_write/generated_data/'));
datapath dir.create(datapath)
# specify order of variables in the output table for readability
= c('contrast', 'celltype', 'pathway', 'NES', 'padj', 'leadingEdge')
var.order
# result format for gsea results
= function(list){
filter.gsea lapply(list, function(x) x %>% filter(padj < 0.05))
}
= function(x) {
format.result %>%
x select(all_of(var.order), everything()) %>%
arrange(celltype, NES) %>%
::remove_rownames()
tibble
}
# Baseline curated high responder signals gsea age sex batch adjusted
= readRDS(file = here("mid_res/baseline_response/dataV3/g0.sub.rds"))
g0.sub = do.call(rbind, g0.sub)
g0.sub $contrast = 'baseline high vs low responders'
g0.sub= format.result(g0.sub) %>%
d0.res select(-c(signal)) %>%
mutate(model = 'gene ~ 0 + response.group + batch + sex + age')
# day 1 non-adjuvanted vaccine
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g1c.rds'))
g1c = do.call(rbind, g1c) %>%
g1c filter(padj < 0.05)
$contrast = '24h vs baseline unadjuvanted vaccine'
g1c$model = 'gene ~ 0 + timepoint + batch + sex + age + (1|subjectid) '
g1c
# day 7 non-adjuvanted vaccine
= readRDS(file = here('mid_res/1_H1N1_pseudobulk_DE/dataV4/gsea/g7f.rds'))
g7f = lapply(g7f, function(x)
g7f %>%
x filter(!str_sub(pathway, 1,5) == 'REACT' ) %>%
filter(NES > 0) %>%
filter(pval <0.1)
)= do.call(rbind, g7f)
g7f $contrast = 'day 7 vs baseline unadjuvanted vaccine'
g7f$model = 'gene ~ 0 + timepoint + batch + sex + age + (1|subjectid) '
g7f
# as03 model
= readRDS(file = here('mid_res/3_H1N1_H5N1_joint_pseudobulk_DE/dataV4/gsea/gc.rds'))
gc = lapply(gc, function(x) x %>% filter(padj < 0.05))
gc = do.call(rbind, gc)
gc $contrast = 'AS03 vs unadjuvanted vaccine day 1 vs baseline fold change difference'
gc= format.result(gc) %>%
gc mutate(model = '0 + timepoint_vaccinegroup + sex + age + (1|subjectid)')
# combine
= rbind(g1c, g7f, gc, d0.res)
d
# reorder columns
= d %>% select(pathway, celltype, model, contrast, pval, padj, NES, leadingEdge, everything())
d
# write results
::fwrite(d,
data.tablefile = paste0(datapath,'combined.results.fsc.txt'),
sep = '\t')
# gene signatures
= readRDS(file = here('signature_curation/sig_test_sub.rds'))
core_sigs = data.table::fread(file = here('signature_curation/sig_test_sub_annotation.txt'))
mann = mann
dmod $signature_genes = core_sigs
dmod
# add natural adjuvant signatures
= readRDS(file = here('mid_res/nat_adj/generated_data/V4/as03.sig.list.rds'))
as03.sig.list = readRDS(file = here('mid_res/nat_adj/generated_data/V4/mono.as03.sig.validated.rds'))
mono.as03.sig.validated = readRDS(file = here('mid_res/nat_adj/generated_data/V4/dc.as03.sig.validated.rds'))
dc.as03.sig.validated = list( 'AS03_Monocyte_validated' = mono.as03.sig.validated,
validated 'AS03_mDC_validated' = dc.as03.sig.validated)
= read.csv(file = here('mid_res/nat_adj/generated_data/V4/mdc.na.gsea.leadingEdge.txt'),header = FALSE )$V1
mdc.na.gsea = read.csv(file = here('mid_res/nat_adj/generated_data/V4/mono.na.gsea.leadingEdge.txt'),header = FALSE )$V1
mono.na.gsea = list('Monocyte_highresponder_naturaladjuvant' = mono.na.gsea,
natural.adjuvant 'mDC_highresponder_naturaladjuvant' = mdc.na.gsea)
# combine
= c(as03.sig.list, validated, natural.adjuvant)
AdjuvantSignatures
= data.frame(
dcitepathway = names(AdjuvantSignatures),
annotation = c(rep('CITE-seq contrast model combined mDC and CD14 monocytes age and sex adjusted', 3),
rep('CITE-seq contrast model combined mDC and CD14 monocytes age and sex adjusted validated in external cohort', 2),
rep('CITE-seq contrast model combined mDC and CD14 monocytes age and sex adjusted Validated in external cohort and elevated in high responders at baseline also age and sex adjusted in high vs low responders', 2)
)
)$signature_genes = AdjuvantSignatures
dcite# combine with signatures
= rbind(dmod, dcite)
dmod
::fwrite(dmod,
data.tablefile = paste0(datapath,'combined.modules.fsc.txt'),
sep = '\t')
## day 7 signatures
= readRDS("signature_curation/core_d7.rds")
sig7 = sig7[c(6,9,10)]
sig7 names(sig7)
= data.frame(
d7 pathway = names(sig7),
annotation = c(
'Li Blood Transcriptome Modules',
'bulk derived signture (genes with d7 fold change robustly correlated with adjMFC)',
'genes coherently perturbecd day 7 vs baseline ("cluster 4") Tsang 2014 Fig 3C')
)$signature_genes = sig7
d7::fwrite(d7,
data.tablefile = paste0(datapath,'day7.bulk.signatures.txt'),
sep = '\t')
### Variance fractions
= readRDS(file = here('mid_res/variance_partition/generated_data/vp.rds'))
vp = as.data.frame(vp) %>%
vp rownames_to_column('gene') %>%
select(gene, everything()) %>%
mutate(model = '~ age + (1|sex) + (1|subjectid) + (1|celltype) + (1|timepoint) + (1|adjmfc.group) + (1|celltype:timepoint)')
# write
::fwrite(vp,
data.tablefile = paste0(datapath,'variance.partition.across.celltypes.txt'),
sep = '\t')
# per cell type results
#pb = readRDS(file = here('mid_res/variance_partition/generated_data/pb_vp.rds'))
= list.files(
dl path = here('mid_res/variance_partition/generated_data/'),
pattern = '.rds',
recursive = TRUE,
full.names = TRUE
)= dl[-c(15,17)] # remove total bulk
dl
# get cell type names (file names)
= list.files(
cts path = here('mid_res/variance_partition/generated_data/'),
pattern = '.rds',
recursive = TRUE,
full.names = FALSE
)= cts[-c(15,17)]
cts = str_replace_all(string = cts,pattern = 'vp.rds', replacement = '')
cts # read and format variance partition results
= lapply(dl, readRDS)
vl for (i in 1:length(vl)) {
= vl[[i]] %>%
vl[[i]] as.data.frame() %>%
rownames_to_column('gene') %>%
mutate(model_fit_within_this_celltype = names(vl)[i]) %>%
mutate(model = '~ age + (1|sex) + (1|subjectid) + (1|timepoint) + (1|adjmfc.group) + (1|timepoint:adjmfc.group)')
}= do.call(rbind, vl)
vp_within ::fwrite(vp_within,
data.tablefile = paste0(datapath,'variance.partition.within.celltypes.txt'),
sep = '\t')
# Added a table for subject level metadata.
# modified in uploaded version to mask subject 38.
= read.delim(file = 'data/full_metadata/full_sample_metadata.txt',header = T,sep = '\t')
met = met %>% filter(CITEdata == '1')
met ::fwrite(met,
data.tablefile = paste0(datapath,'metadata.txt'),
sep = '\t')
Additional code provided here for reference. This documents
how starting data used in that analysis was created from raw sequence
counts. The Zenodo repository also includes CITE-seq data in additional
formats for reuse with other packages as follows:
flu_vacc_CITEseq_Seurat4.rds
- a Seurat version 4 object
(separate assays for RNA and ADT).
flu_vacc_CITEseq_combinedassay.h5ad
- an Anndata object for
analysis in python / scanpy. (RNA and ADT
combined into a single assay).
Data from one subject could not be released due to the type of data reuse consent provided by this subject. For this reason starting raw data with all cells multiplexed together cannot be released. The workflow for pre-processing is provided here for reference.
The contents of the zenodo repository include:
Flu_CITEseq_preprocess
+ data
+
signature_curation
.
The directories data
and signature_curation
should be placed directly inside the root directory of the analysis code
downloaded from github to reproduce analysis reported in the manuscript.
The structure of the analysis directory should be:
fsc-master
–signature_curation
–data
–functions
–mid_res
Scripts here are used to document the creation of starting single
cell data object h1h5_annotated_with_meta.rds
from raw HTO
ADT and mRNA umi reads. h1h5_annotated_with_meta.rds
is the
starting data used in the main analysis directory mid_res
.
A visual guide below shows how the object is built.
Scripts:
1_write_bc_for_citeseqcount.r
1a_RunCITEseqCount.txt
2_merge10xlanes_hiseq_hto_adt_rna_V8.r
2a_RUNDEMUXLET.txt
3_hto_demux_by_batch.r
3_Multiseq_Demultiplex_by_batch.r
4_validate_htodmx_w_demuxlet_singlets.r
4_validate_multiseq_w_demuxlet_singlets.r
5_save_merged_Seurat_singlet_object.r
Visual guide:
-Experiment
┌───────────────────────┐ ┌─────────────────────┐
│ Concentrated antibody │ │ Pooled cells │
│ based on titration │ │ 12 donors │
│ 83 surface antibodies │ │ 24 samples │
│ + 4 isotype control │ │ per batch │
└──────────────┬────────┘ └────┬────────────────┘
│ │
│ ┌─────────┐ │
│ │ stain │ │
└───▶│ cells │──┘
└────┬────┘
┌──────────────┐ │
│ load on 10X │ │
┌┴────┬─────────┴─┘
│ │
│ │
▼ ▼
.─. .─. ┌──────────────────────┐
( ) ( ) │... x 6 lanes / batch │
`─' `─' └──────────────────────┘
│ ┌──────────────────────┐
│ │ ... x 3 batches │
-Processing │ └──────────────────────┘
▼
┌───────────────────────────────┐
│ HIseq 2500 sequencing GEMs │
└───────────────┬───────────────┘
│ 1_write_bc_for_citeseqcount.R
┌──────┘ ┌────────▶ Save barcode "whitelist"
│ │ for ADT alignment / dsb.
│ │ │
▼ │ ▼
┌──────────────────┐ │ ┌──────────────────┐
│Cell Ranger Count │ │ │ Cite-Seq-Count │
└──────────────────┘ │ └─────────┬────────┘
│ │ X 18 lanes
X 18 lanes │ │
│ │ ▼
▼ │
cell │ cell cell
┌────────┐ │ ┌────────┐ ┌────────┐
│ ┌──────┴─┐ │ │ ┌──────┴─┐ │ ┌──────┴─┐
gene│ │ │ │ HTO │ │ │ ADT │ │ │
│ │mRNA UMI│ │ │ │HTO UMI │ │ │ADT UMI │
│ │ .mtx │─────────┘ │ │ .mtx │ │ │ .mtx │
└─┤ │ └─┤ │ └─┤ │
└────────┘ └────────┘ └────────┘
... x 18 lanes ... x 18 lanes ... x 18 lanes
│ │
│ 3_hto_demux_by_batch.R
- demultiplexing │ ┌──────────┴─────┐
- doublet exclusion │ │ HTODemux │
- cell qc │ └──────────┬─────┘ Saved:
│ 3_Multiseq_Demultiplex_by_batch.R background drops
▼ ┌──────────┴─────┐
┌────────────────┐ │ multiseq │ empty droplets
│ Demuxlet │ │ deMULTIplex │ ┌────────┐
└────────────────┘ └────────────────┘ │ │
│ │ ADT │ │
└────────────┬──────────┘ │ │
▼ └────────┘
Merge quality-controlled singlets │
4_validate_multiseq_w_demuxlet_singlets.R │
4_validate_htodmx_w_demuxlet_singlets.R │
5_save_merged_Seurat_singlet_object.R │
│
┌───────────────────────────────────────────────────┐ │
│ h1_h5_merged_seurat_object_demultiplexed_sng.rds │ │
└───────────────────────────────────────────────────┘ │
│
cells cells ┌───────────┘
┌────────┐ ┌────────┐ │
gene│ │ │ │ │
│ │ ADT │ │ │
│ │ │ │ │
└────────┘ └────────┘ │
│ │ │
│ └─────────────┤
│ │
▼ ▼
- normalization ┌────────────────┐ ┌─────────────────────────────────┐
- clustering │ scran RNA norm │ │ dsb normalization and denoising │
└────────────────┘ └─────────────────────────────────┘
│
1_dsbnorm_prot_scrannorm_rna.R │
▼
┌────────────────────────────────────────┐
│ protein based clustering │
│ cell type annotation │
└────────────────────────────────────────┘
│
1_merged_h1h5_adt_clustering.R
2_joint_cluster_annotation.R
│
┌────────────────────────────┘
│
│
│
▼
┌───────────────────────────────────────┐
│h1h5_annotated_with_meta.rds │
│ mRNA │
│ - raw counts │
│ - scran normalized values │
│ ADT │
│ - raw counts │
│ - dsb normalized values │
│ cell meta data │
│ - donor information │
└───────────────────────────────────────┘
Code is provided for all steps for reference.
The directories listed below contain processing scripts for different
papers which used this same starting data. These scripts can be ignored
and are included for internal reference.
raw_dsb_data
h5_CHI
This section uses R version 3.5.1, CITE-seq-count version 1.4.2, and demuxlet.
Directory 1: /1_preprocessing/ Contains the pipeline for read mapping demultiplexing etc. Cells are demultiplexed with a combination of genotype based demultiplexing and hashing antibodies. Genotype data is required to run these steps and we therefore provide scripts and outputs.
= c(
fl list.files(here('preprocess/1_preprocessing/'))
) data.frame(fl = sort(fl))
## fl
## 1 1_write_bc_for_citeseqcount.R
## 2 1a_RunCITEseqCount.txt
## 3 2_merge10xlanes_hiseq_hto_adt_rna_V8.R
## 4 2a_RUNDEMUXLET.txt
## 5 3_hto_demux_by_batch.R
## 6 3_Multiseq_Demultiplex_by_batch.R
## 7 4_validate_htodmx_w_demuxlet_singlets.R
## 8 4_validate_multiseq_w_demuxlet_singlets.R
## 9 5_save_merged_Seurat_singlet_object.R
## 10 functions
The pipeline listed above must be run in order of the numbered r scripts listed above.
The first script writes a set of barcodes to be used as a whitelist
for CITE-seq-count alignment that have a minimum amount of (ambient)
mRNA reads. This reads the umi count data from
hiseq_output_0353_0355
which were aligned with Cell
Ranger.
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(here))
# library(data.table)
options(stringsAsFactors = FALSE)
setwd(here())
= here()
dir
dir
dir.create(path = "data/1_preprocessing_data/preprocess_output", recursive = T)
dir.create(path = "data/1_preprocessing_data/barcodes_for_CITEseqCount", recursive = T)
dir.create(path = "data/1_preprocessing_data/demuxlet_barcodes",recursive = T)
# full path to reads.
= paste0(dir,"/hiseq_output_0353_0355/")
path_to_reads
#read in ensembl gene mapping applies to all sets.
= read.table(file = paste0(path_to_reads, "H1B1ln1cDNA/outs/raw_feature_bc_matrix/features.tsv.gz"),
x header= FALSE, stringsAsFactors = F,sep = '\t')
# changed Seurat v 2.3.4 Read10X fcn to accomodate cellranger 3.0 compressed output and filenames.
source("1_preprocessing/functions/preprocessing_functions.R")
= list.files(path_to_reads, full.names=T, pattern = "cDNA")
umi.files = list.files(path_to_reads, full.names=F, pattern = "cDNA")
umi.names = list()
umi.list = lapply(umi.files, function(x){
umi.list Read10X_MPM(data.dir = paste0(x,"/outs/raw_feature_bc_matrix/"))
})#saveRDS(umi.list, file = "data/preprocess_output/raw_umi_list.rds")
# create object with minimal filtering to retain background drops for ADT normalization / HTOdemux function.
# using the CreateSeuratFunction for convenience of the min.genes cell filter parameter.
= lapply(umi.list, function(x){ CreateSeuratObject(raw.data = x, min.genes = 10, min.cells = 5) })
sl = lapply(sl, function(x){ x@cell.names })
lane.barcodes
# output a whitelist of barcodes for CITEseq count.
for (i in 1:length(lane.barcodes)) {
write.table(lane.barcodes[[i]],
file =paste0(dir,"/data/1_preprocessing_data/barcodes_for_CITEseqCount/",umi.names[i],"_","barcode.list.tsv"),
sep = '\t',quote = FALSE, col.names = F, row.names = F)
}
sessionInfo()
# MPM -- ran on http://ai-rstudioprd1.niaid.nih.gov:8787/
# containerized version Jan 29 2020
# R version 3.5.2 (2018-12-20)
# Platform: x86_64-redhat-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.6 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] here_0.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 forcats_0.3.0 stringr_1.4.0 dplyr_0.7.8 purrr_0.3.2
# [9] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1 tidyverse_1.2.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 ggridges_0.5.1 class_7.3-14 modeltools_0.2-22 rprojroot_1.3-2
# [7] mclust_5.4.3 htmlTable_1.13.1 base64enc_0.1-3 proxy_0.4-22 rstudioapi_0.9.0 npsurv_0.4-0
# [13] bit64_0.9-7 flexmix_2.3-14 mvtnorm_1.0-8 lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15
# [19] splines_3.5.2 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-3 knitr_1.22 Formula_1.2-3
# [25] jsonlite_1.6 ica_1.0-2 broom_0.5.0 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7
# [31] R.oo_1.22.0 compiler_3.5.2 httr_1.3.1 backports_1.1.3 assertthat_0.2.1 lazyeval_0.2.2
# [37] cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6 tools_3.5.2 igraph_1.2.4
# [43] bindrcpp_0.2.2 gtable_0.3.0 glue_1.3.1 reshape2_1.4.3 RANN_2.6.1 Rcpp_1.0.0
# [49] cellranger_1.1.0 trimcluster_0.1-2.1 gdata_2.18.0 ape_5.2 nlme_3.1-137 iterators_1.0.10
# [55] fpc_2.1-11.1 lmtest_0.9-36 gbRd_0.4-11 xfun_0.6 rvest_0.3.2 irlba_2.3.3
# [61] gtools_3.8.1 DEoptimR_1.0-8 zoo_1.8-3 MASS_7.3-51.1 scales_1.0.0 hms_0.4.2
# [67] doSNOW_1.0.16 parallel_3.5.2 RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10 pbapply_1.3-4
# [73] gridExtra_2.3 segmented_0.5-3.0 rpart_4.1-13 latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4
# [79] checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2 Rdpack_0.9-0 SDMTools_1.1-221 rlang_0.3.1
# [85] pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6 lattice_0.20-38 ROCR_1.0-7
# [91] bindr_0.1.1 htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
# [97] R6_2.4.0 snow_0.4-2 gplots_3.0.1.1 Hmisc_4.2-0 pillar_1.3.1 haven_1.1.2
# [103] foreign_0.8-71 withr_2.1.2 mixtools_1.1.0 fitdistrplus_1.0-14 survival_2.43-3 nnet_7.3-12
# [109] tsne_0.1-3 hdf5r_1.0.0 modelr_0.1.2 crayon_1.3.4 KernSmooth_2.23-15 grid_3.5.2
# [115] readxl_1.1.0 data.table_1.12.0 metap_1.0 digest_0.6.18 diptest_0.75-7 R.utils_2.7.0
# [121] stats4_3.5.2 munsell_0.5.0
Next CITEseq Count was run for each lane separately for both ADTs and
HTOs with the -t
parameter referencing the barcode lists
created above.
# Create a combined Seurat object for all hiseq runs merged across 18 10X lanes.
# Add the raw data matrices from CellRanger containing all possible cell barcodes to retain background "negative"
# This script appends barcodes with the lane name e.g. AAACCTGAGAAGGTTT_H1B1ln1 (H1N1 cohort, batch 1 lane 1).
# ADT assay HTO assay and RNA assays are all merged for each lane by the intersection of barcodes detected in all three assays.
# Add metadata for batch, HTO and ADT QC metrics
# The current version accomodates sparse matrix input for ADT HTO data from CiteseqCount v1.4.1.
# used http://ai-rstudioprd1.niaid.nih.gov:8787/
suppressMessages(library(tidyverse))
suppressMessages(library(Seurat))
suppressMessages(library(here))
# library(data.table)
options(stringsAsFactors = FALSE)
setwd(here())
= here()
dir
dir
# full path to reads.
= paste0(dir,"/hiseq_output_0353_0355/")
path_to_reads
#read in ensembl gene mapping applies to all sets.
= read.table(file = paste0(path_to_reads, "H1B1ln1cDNA/outs/raw_feature_bc_matrix/features.tsv.gz"), header= FALSE, stringsAsFactors = F,sep = '\t')
x
# changed Seurat v 2.3.4 Read10X fcn to accomodate cellranger 3.0 compressed output and filenames.
source(here("../functions/preprocessing_functions.R"))
= list.files(path_to_reads, full.names=T, pattern = "cDNA")
umi.files = list.files(path_to_reads, full.names=F, pattern = "cDNA")
umi.names = list()
umi.list = lapply(umi.files,
umi.list function(x){Read10X_MPM(data.dir = paste0(x,"/outs/raw_feature_bc_matrix/"))})
# from 1a
# create object with minimal filtering to retain background drops for ADT normalization / HTOdemux function.
# sl = lapply(umi.list, function(x){ CreateSeuratObject(raw.data = x, min.genes = 10, min.cells = 2) })
# # create a list of barcode names
# lane.barcodes = lapply(sl, function(x){ x@cell.names })
# load and format antibody data mapped from CITEseqCount V1.4.2
= list.files(paste0(path_to_reads,"CITEseqCountOut_v1.4.2"), full.names=T, pattern = "ADT")
adt.files = list.files(paste0(path_to_reads,"CITEseqCountOut_v1.4.2"), full.names=F, pattern = "ADT_")
adt.names = lapply(adt.files,function(x){ Read10X_V3(data.dir = paste0(x,"/umi_count/"), gene.column = 1) })
adt.list # remove the unneeded barcode string on the protein name and the "unmapped" row
for (i in 1:length(adt.list)) {
@Dimnames[[1]][1:87] = str_sub(string = adt.list[[i]]@Dimnames[[1]][1:87], start = 1,end = -17)
adt.list[[i]]= adt.list[[i]][-88, ]
adt.list[[i]]
}# subset list of ADT by top ranked barcodes from ADT assay.
for (i in 1:length(adt.list)) {
= colSums(adt.list[[i]]) %>% sort(decreasing = TRUE)
adt_sum = names(adt_sum)[1:35000]
subs = adt.list[[i]][ ,colnames(adt.list[[i]]) %in% subs]
adt.list[[i]]
}names(adt.list) = adt.names
# repeat the steps above on the hashing data. 13 hto are mapped -- test w/ lapply(hto.list, dim %>% print)
= list.files(paste0(path_to_reads,"CITEseqCountOut_v1.4.2"), full.names=T, pattern = "HTO")
hto.files = list.files(paste0(path_to_reads,"CITEseqCountOut_v1.4.2"), full.names=F, pattern = "HTO_")
hto.names = lapply(hto.files,function(x){ Read10X_V3(data.dir = paste0(x,"/umi_count/"), gene.column = 1) })
hto.list for (i in 1:length(hto.list)) {
@Dimnames[[1]][1:87]=str_sub(string = hto.list[[i]]@Dimnames[[1]][1:87], start = 1,end = -17)
hto.list[[i]]=hto.list[[i]][-14, ]
hto.list[[i]]
}# Get top ranked barcodes from HTO assay
for (i in 1:length(hto.list)) {
= colSums(adt.list[[i]]) %>% sort(decreasing = TRUE)
hto_sum = names(hto_sum)[1:35000]
subs = hto.list[[i]][ ,colnames(hto.list[[i]]) %in% subs]
hto.list[[i]]
}names(hto.list) = hto.names
# Subset cells in each 10X lane by the intersection of barcodes in ADT, HTO, RNA assays
# append barcode name with a string corresponding to each lane; crucial to avoid barcode collision.
= list()
lane.barcodes for (i in 1:length(umi.list)) {
@Dimnames[[2]] = paste0(umi.list[[i]]@Dimnames[[2]],"_",substr(umi.names[[i]], 1,7))
umi.list[[i]]@Dimnames[[2]] = paste0(adt.list[[i]]@Dimnames[[2]],"_",substr(adt.names[[i]], 12,18))
adt.list[[i]]@Dimnames[[2]] = paste0(hto.list[[i]]@Dimnames[[2]],"_",substr(hto.names[[i]], 12,18))
hto.list[[i]]= intersect(x = colnames(hto.list[[i]]), y = colnames(adt.list[[i]]))
lane.barcodes[[i]] = intersect(x = colnames(umi.list[[i]]),y = lane.barcodes[[i]])
lane.barcodes[[i]] = umi.list[[i]][ ,as.character(lane.barcodes[[i]])]
umi.list[[i]] = adt.list[[i]][ ,as.character(lane.barcodes[[i]])]
adt.list[[i]] = hto.list[[i]][ ,as.character(lane.barcodes[[i]])]
hto.list[[i]]
}
# merge into a single Seurat Object; merge prior to adding assay data b/c Seurat V2 drops ADT and HTO matrix after merge.
# minimal to retain drops with residual ADT for later normalization.
= lapply(umi.list, function(x){ CreateSeuratObject(raw.data = x, min.genes = 10, min.cells = 5) })
Seurat.Object.List = Seurat.Object.List[[1]]
b1 for (i in 2:length(Seurat.Object.List)) {
= MergeSeurat(object1 = b1, object2 = Seurat.Object.List[[i]], do.normalize = F)
b1
}
## subset HTO and ADT matrix to merged cells and set CITE, HTO assay raw data
names(adt.list) = names(hto.list) = NULL
= do.call(cbind, adt.list)
adt.merged = adt.merged[ ,b1@cell.names]
adt.merged = SetAssayData(b1, assay.type = "CITE", slot ="raw.data", new.data = adt.merged)
b1
## merge hto data
= do.call(what = cbind, args = hto.list)
hto.merged = hto.merged[ ,b1@cell.names]
hto.merged = SetAssayData(b1,assay.type = "HTO",slot = "raw.data",new.data = hto.merged)
b1
# Add metadata parameters to Seurat object - pecent mitochondrial genes, tenx_lane, cohort, batch, barcode checker.
= grep(pattern = "^MT-", rownames(b1@data), value = TRUE)
MT = Matrix::colSums(b1@raw.data[MT, ])/Matrix::colSums(b1@raw.data)
pctMT = AddMetaData(b1, metadata = pctMT, col.name = "pctMT")
b1
= b1@meta.data %>%
metadf rownames_to_column("barcodes") %>%
mutate(barcode_check = barcodes) %>%
mutate(tenx_lane = substr(barcode_check, 18,24)) %>%
mutate(cohort = if_else(str_sub(tenx_lane, 1, 2) == "H1", "H1N1", "H5N1")) %>%
mutate(batch = if_else(cohort == "H1N1" & substr(barcodes,21,21) == 1, true = "1",
if_else(cohort == "H1N1" & substr(barcodes,21,21) == 2, true = "2", false = "3"))) %>%
column_to_rownames("barcodes") %>%
select(barcode_check, tenx_lane, cohort, batch)
= AddMetaData(b1, metadata = metadf)
b1
# write barcode output add string "-1" for demuxlet barcodes
= str_sub(umi.names, 1,7)
lane.names = lapply(lane.barcodes, function(x){ str_sub(x, 1,16) })
lane.barcodes.sub = lapply(lane.barcodes.sub, function(x){ paste0(x,"-1") })
demuxlet.barcode
for (i in 1:length(demuxlet.barcode)) {
write.table(demuxlet.barcode[[i]],
file = paste0(dir,"/data/1_preprocessing_data/demuxlet_barcodes/",lane.names[i],"_","barcode.list.tsv"),
sep = "\t", col.names = F, row.names = F, quote = F)
}
# Seurat object merged
saveRDS(b1, file = paste0(dir,"/data/Merged_H1_H5_",ncol(b1@raw.data),"_cells.rds"))
saveRDS(hto.merged, file = paste0(dir,"/data/hto.merged.v2.rds"))
saveRDS(adt.merged, file = paste0(dir,"/data/adt.merged.v2.rds"))
sessionInfo()
# MPM -- ran on http://ai-rstudioprd1.niaid.nih.gov:8787/
# containerized version Jan 29 2020
# R version 3.5.2 (2018-12-20)
# Platform: x86_64-redhat-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.6 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
# [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
# [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] bindrcpp_0.2.2 here_0.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 forcats_0.3.0
# [7] stringr_1.4.0 dplyr_0.7.8 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.1
# [13] ggplot2_3.1.1 tidyverse_1.2.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 ggridges_0.5.1 class_7.3-14 modeltools_0.2-22
# [6] rprojroot_1.3-2 mclust_5.4.3 htmlTable_1.13.1 base64enc_0.1-3 proxy_0.4-22
# [11] rstudioapi_0.9.0 npsurv_0.4-0 bit64_0.9-7 flexmix_2.3-14 mvtnorm_1.0-8
# [16] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15 splines_3.5.2 R.methodsS3_1.7.1
# [21] lsei_1.2-0 robustbase_0.93-3 knitr_1.22 Formula_1.2-3 jsonlite_1.6
# [26] ica_1.0-2 broom_0.5.0 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7
# [31] R.oo_1.22.0 compiler_3.5.2 httr_1.3.1 backports_1.1.3 assertthat_0.2.1
# [36] lazyeval_0.2.2 cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6
# [41] tools_3.5.2 igraph_1.2.4 gtable_0.3.0 glue_1.3.1 reshape2_1.4.3
# [46] RANN_2.6.1 Rcpp_1.0.0 cellranger_1.1.0 trimcluster_0.1-2.1 gdata_2.18.0
# [51] ape_5.2 nlme_3.1-137 iterators_1.0.10 fpc_2.1-11.1 lmtest_0.9-36
# [56] gbRd_0.4-11 xfun_0.6 rvest_0.3.2 irlba_2.3.3 gtools_3.8.1
# [61] DEoptimR_1.0-8 zoo_1.8-3 MASS_7.3-51.1 scales_1.0.0 hms_0.4.2
# [66] doSNOW_1.0.16 parallel_3.5.2 RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10
# [71] pbapply_1.3-4 gridExtra_2.3 segmented_0.5-3.0 rpart_4.1-13 latticeExtra_0.6-28
# [76] stringi_1.4.3 foreach_1.4.4 checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2
# [81] Rdpack_0.9-0 SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2 dtw_1.20-1
# [86] prabclus_2.2-6 bitops_1.0-6 lattice_0.20-38 ROCR_1.0-7 bindr_0.1.1
# [91] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
# [96] R6_2.4.0 snow_0.4-2 gplots_3.0.1.1 Hmisc_4.2-0 pillar_1.3.1
# [101] haven_1.1.2 foreign_0.8-71 withr_2.1.2 mixtools_1.1.0 fitdistrplus_1.0-14
# [106] survival_2.43-3 nnet_7.3-12 tsne_0.1-3 hdf5r_1.0.0 modelr_0.1.2
# [111] crayon_1.3.4 KernSmooth_2.23-15 grid_3.5.2 readxl_1.1.0 data.table_1.12.0
# [116] metap_1.0 digest_0.6.18 diptest_0.75-7 R.utils_2.7.0 stats4_3.5.2
# [121] munsell_0.5.0
Demuxlet was next run for each of the 18 lanes separately referencing non-imputed illumina chip based genotype data on a single HPC core with 20G ram with the following options:
demuxlet
--sam /path_to_bam
--tag-group CB
--tag-UMI UB
--vcf /path_to_vcf
--min-snp 5
--field GT
--geno-error 0.01
--out out_path/
--alpha 0 --alpha 0.5
--group-list /path_to_barcode_list
Note that raw genotype data (the actual vcf files) cannot be uploaded to this repository, so the outputs of demuxlet are provided for each lane herein.
Demultiplex the hashing antibodies (which define each subjects cells from different timepoints) using Seurats HTODemux function and the functions from the multiseq demultiplexing method then take the union of singlets.
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
setwd(here())
= here()
dir
# read in data
#dir.create("preprocess_output/demultiplexing_plots")
# dir.create("preprocess_output/demultiplexing_output")
# source(here("/1_preprocessing/functions/demultiplexing_functions.R"))
# get data for each batch separately
= readRDS(file = "data/Merged_H1_H5_629949_cells.rds")
s = s %>%
h1 SetAllIdent(id = "cohort") %>%
SubsetData(ident.use = "H1N1") %>%
SetAllIdent(id = "batch")
= SubsetData(h1, ident.use = "1")
h1b1 = SubsetData(h1, ident.use = "2")
h1b2 = s %>% SetAllIdent(id = "cohort") %>% SubsetData(ident.use = "H5N1")
h5
# Essential: remove the unused hash from H1 batch 1 and batch 2. (used all for H5)
= h1b1@assay$HTO@raw.data
h1b1hto = h1b1hto[-12, ]
h1b1hto = h1b2@assay$HTO@raw.data
h1b2hto = h1b2hto[-c(10,12), ]
h1b2hto = h5@assay$HTO@raw.data
h5b1hto
# add back subsetted hto data
= SetAssayData(h1b1,new.data = h1b1hto, assay.type = "HTO", slot = "raw.data")
h1b1 = SetAssayData(h1b2, new.data = h1b2hto, assay.type = "HTO", slot = "raw.data")
h1b2 = SetAssayData(h5, new.data = h5b1hto, assay.type = "HTO", slot = "raw.data")
h5
# list of objects indexed by batch.
## normalize data for HTODemux
# !! CITE normalization gets overwritted with the dsb norm in the next script! bug in seurat v2 req. CITE normalized for HTODMX
= list(h1b1,h1b2, h5)
l = lapply(l, function(x){
l = x %>%
x NormalizeData(assay.type = "CITE",normalization.method = "genesCLR") %>%
NormalizeData(assay.type = "HTO", normalization.method = "genesCLR") %>%
ScaleData(assay.type = "HTO", do.par = T, num.cores = 8)
})
#l = list(h1b1,h1b2, h5)
#for (i in 1:length(l)) {
# l[[i]] <- NormalizeData(l[[i]],assay.type = "CITE",normalization.method = "genesCLR",display.progress = T)
# l[[i]] <- NormalizeData(l[[i]],assay.type = "HTO", normalization.method = "genesCLR",display.progress = T)
# l[[i]] <- ScaleData(l[[i]], assay.type = "HTO", display.progress = FALSE, do.par = T, num.cores = 8)
#}
# demultiplex round 1 with HTODemux for each batch
= l[[1]]
h1b1 = HTODemux(h1b1, assay.type = "HTO", positive_quantile = 0.99995)
h1b1 = SetAllIdent(h1b1, id = "hash_maxID")
h1b1 # at current stage the day 0 low serving as bridge across batches won't be classified, just keep as d0 high.
= c("HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6","HTO_7",
current.hash "HTO_8","HTO_9", "HTO_10","HTO_12","HTO_14")
= c("d0 high","d1 high","d0 high","d7 high","unstained_control","d0 low","d1 low",
responder "d0 low","d7 low","d0 high","d0 low","d0 high")
@ident = plyr::mapvalues(x = h1b1@ident, from = current.hash, to = responder)
h1b1= StashIdent(object = h1b1, save.name = "adjmfc.time")
h1b1
# batch 2
= l[[2]]
h1b2 = HTODemux(h1b2, assay.type = "HTO", positive_quantile = 0.99999)
h1b2 = SetAllIdent(h1b2, id = "hash_maxID")
h1b2 # Confirm that levels(factor(h1b2@ident)) = 11 not 12 adn does not include hto 10
= c( "HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6",
current.hash2 "HTO_7","HTO_8","HTO_9","HTO_12", "HTO_14")
= c("d0 high", "d1 high", "d0 high", "d7 high", "unstained_control", "d0 low", "d1 low",
responder2 "d0 low", "d7 low","d0 low","d0 high")
@ident = plyr::mapvalues(x = h1b2@ident, from = current.hash2, to = responder2)
h1b2= StashIdent(object = h1b2, save.name = "adjmfc.time")
h1b2
= l[[3]]
h5 = HTODemux(h5, assay.type = "HTO", positive_quantile = 0.9999)
h5 # H5N1 for consistency for later scripts use adfmfc.time for H5, (HTO only map to timepoint in this batch).
= c("HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6", "HTO_7", "HTO_8",
current.hash3 "HTO_9", "HTO_10", "HTO_12", "HTO_13", "HTO_14")
= c("d0", "d1", "dC", "d1", "d0", "dC", "d0", "d1",
responder3 "dC", "d0", "d1", "dC", "unstained_control")
# responder3 = c("day0", "day1", "dc", "day1", "bridge_H1209_d0", "dc", "day0", "day1",
# "dc", "day0", "day1", "dc", "unstained_control")
@ident = plyr::mapvalues(x = h5@ident, from = current.hash3, to = responder3)
h5= StashIdent(object = h5, save.name = "adjmfc.time")
h5
# merge objects and add back assay data which is lost when merging (in V2.3.4)
= MergeSeurat(h1b1, h1b2, do.normalize = FALSE,do.scale = FALSE)
h1n1.all = MergeSeurat(h5, h1n1.all, do.normalize = FALSE, do.scale = FALSE )
h1h5
# read hto data
= readRDS("data/hto.merged.v2.rds")
hto = hto[ ,h1h5@cell.names]
hto
# read adt data
= readRDS("data/adt.merged.v2.rds")
adt = adt[ ,h1h5@cell.names]
adt
# SetAssayData [contains baked in row match argument to make sure cell order is the same].
= SetAssayData(h1h5, assay.type = "CITE", slot = "raw.data", new.data = adt)
h1h5 = SetAssayData(h1h5, assay.type = "HTO", slot = "raw.data", new.data = hto)
h1h5 #dir.create("../data/baseline_nov22")
saveRDS(h1h5, file = "data/h1h5.all.htodemux.rds")
sessionInfo()
# MPM -- ran on http://ai-rstudioprd1.niaid.nih.gov:8787/
# containerized version Jan 29 2020
# R version 3.5.2 (2018-12-20)
# Platform: x86_64-redhat-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.6 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_Nrm(list=setdiff(ls(), "x")) AME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] bindrcpp_0.2.2 here_0.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 forcats_0.3.0 stringr_1.4.0 dplyr_0.7.8 purrr_0.3.2 readr_1.3.1
# [11] tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1 tidyverse_1.2.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 ggridges_0.5.1 class_7.3-14 modeltools_0.2-22 rprojroot_1.3-2 mclust_5.4.3 htmlTable_1.13.1
# [9] base64enc_0.1-3 proxy_0.4-22 rstudioapi_0.9.0 npsurv_0.4-0 bit64_0.9-7 flexmix_2.3-14 mvtnorm_1.0-8 lubridate_1.7.4
# [17] xml2_1.2.0 codetools_0.2-15 splines_3.5.2 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-3 knitr_1.22 Formula_1.2-3
# [25] jsonlite_1.6 ica_1.0-2 broom_0.5.0 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7 R.oo_1.22.0 compiler_3.5.2
# [33] httr_1.3.1 backports_1.1.3 assertthat_0.2.1 lazyeval_0.2.2 cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6
# [41] tools_3.5.2 igraph_1.2.4 gtable_0.3.0 glue_1.3.1 reshape2_1.4.3 RANN_2.6.1 Rcpp_1.0.0 cellranger_1.1.0
# [49] trimcluster_0.1-2.1 gdata_2.18.0 ape_5.2 nlme_3.1-137 iterators_1.0.10 fpc_2.1-11.1 lmtest_0.9-36 gbRd_0.4-11
# [57] xfun_0.6 rvest_0.3.2 irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8 zoo_1.8-3 MASS_7.3-51.1 scales_1.0.0
# [65] hms_0.4.2 doSNOW_1.0.16 parallel_3.5.2 RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10 pbapply_1.3-4 gridExtra_2.3
# [73] segmented_0.5-3.0 rpart_4.1-13 latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4 checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2
# [81] Rdpack_0.9-0 SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6 lattice_0.20-38
# [89] ROCR_1.0-7 bindr_0.1.1 htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5 R6_2.4.0
# [97] snow_0.4-2 gplots_3.0.1.1 Hmisc_4.2-0 pillar_1.3.1 haven_1.1.2 foreign_0.8-71 withr_2.1.2 mixtools_1.1.0
# [105] fitdistrplus_1.0-14 survival_2.43-3 nnet_7.3-12 tsne_0.1-3 hdf5r_1.0.0 modelr_0.1.2 crayon_1.3.4 KernSmooth_2.23-15
# [113] grid_3.5.2 readxl_1.1.0 data.table_1.12.0 metap_1.0 digest_0.6.18 diptest_0.75-7 R.utils_2.7.0 stats4_3.5.2
# [121] munsell_0.5.0
Multiseq:
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(here))
setwd(here())
= here()
dir # read in data
dir.create("preprocess_output/demultiplexing_plots")
dir.create("preprocess_output/demultiplexing_output")
source(here("../functions/demultiplexing_functions.R"))
# get data for each batch separately
= readRDS(file = "data/Merged_H1_H5_629949_cells.rds")
s = s %>%
h1 SetAllIdent(id = "cohort") %>%
SubsetData(ident.use = "H1N1") %>%
SetAllIdent(id = "batch")
= SubsetData(h1, ident.use = "1")
h1b1 = SubsetData(h1, ident.use = "2")
h1b2 = s %>% SetAllIdent(id = "cohort") %>% SubsetData(ident.use = "H5N1")
h5 rm(s)
gc()
# Essential: remove the unused hash from H1 batch 1 and batch 2. (used all for H5)
= h1b1@assay$HTO@raw.data
h1b1hto = h1b1hto[-12, ]
h1b1hto = h1b2@assay$HTO@raw.data
h1b2hto = h1b2hto[-c(10,12), ]
h1b2hto = h5@assay$HTO@raw.data
h5b1hto
# library(deMULTIplex) > sourced from preprocessing_functions.R
# format data matrices for deMULTIplex workflow
= h1b1hto %>% t %>% as.matrix %>% as.data.frame
b1hto = h1b2hto %>% t %>% as.matrix %>% as.data.frame
b2hto = h5b1hto %>% t %>% as.matrix %>% as.data.frame
h5hto = list(b1hto, b2hto, h5hto)
hto_list
# demultiplex each batch
= lapply(hto_list, function(x){ RundeMULTIplex(htomatrix = x) })
final.calls.list
# remove random duplicates (<50 at most of 600k barcodes)
for (i in 1:length(final.calls.list)) {
= final.calls.list[[i]] %>% names() %>% unique()
ub = final.calls.list[[i]][ub]
final.calls.list[[i]]
}
# check dimensional agreement and print current results pre rescue
lapply(hto_list, dim)
lapply(final.calls.list, length)
lapply(final.calls.list, table)
# save object
saveRDS(final.calls.list, file = "preprocess_output/demultiplexing_output/final.calls.list.prerescue.rds")
# remove large seurat objects from workspace
rm(h1b1, h1b2, h5)
gc()
## Rescue negative cells (rate limiting step)
# returns an embedded list with elements reclass.res (a list) and reclass cells
= list()
reclass.results for (i in 1:length(final.calls.list)) {
=
reclass.results[[i]] deMULTIplexRescueNegative(htomatrix = hto_list[[i]],
RundeMULTIplex.calls.vector = final.calls.list[[i]])
}
saveRDS(object = reclass.results, file = "preprocess_output/demultiplexing_output/reclass.res.list.rds")
# plot rescued cells to find optimal classification stability threshold
for (i in 1:length(final.calls.list)) {
deMULTIplexRescueNegativePlot(reclass.res = reclass.results[[i]][[1]],
save.path = "preprocess_output/demultiplexing_plots/",
batch.name = paste0("batch",i))
}
###### !!!!! select optimal threshold based on plot ######
= c(14,15,16)
threshold_vector = list()
final.calls.list.rescued for (i in 1:length(final.calls.list)) {
=
final.calls.list.rescued[[i]] deMULTIplexFinalizeCalls(final.calls = final.calls.list[[i]],
reclass.cells = reclass.results[[i]][[2]],
reclass.threshold = threshold_vector[i])
}
# reload data to add multiseq calls as metadata.
= readRDS(file = "/data/Merged_H1_H5_629949_cells.rds")
s = s %>%
h1 SetAllIdent(id = "cohort") %>%
SubsetData(ident.use = "H1N1") %>%
SetAllIdent(id = "batch")
= SubsetData(h1, ident.use = "1")
h1b1 = SubsetData(h1, ident.use = "2")
h1b2 = s %>% SetAllIdent(id = "cohort") %>% SubsetData(ident.use = "H5N1")
h5 rm(s)
gc()
## Add back multiseq calls to object:
= AddMetaData(h1b1, metadata = final.calls.list[[1]], col.name = "multiseq_hashid") %>% SetAllIdent(id = "multiseq_hashid")
h1b1 = AddMetaData(h1b2, metadata = final.calls.list[[2]], col.name = "multiseq_hashid") %>% SetAllIdent(id = "multiseq_hashid")
h1b2 = AddMetaData(h5, metadata = final.calls.list[[3]], col.name = "multiseq_hashid") %>% SetAllIdent(id = "multiseq_hashid")
h5
# manually map values to the correct timepoint response variable based on hashing.
# H1N1 batch 1
= c("HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6","HTO_7","HTO_8",
current.hash "HTO_9", "HTO_10","HTO_12","HTO_14")
= c("d0 high", "d1 high", "d0 high", "d7 high", "unstained_control", "d0 low",
responder "d1 low", "d0 low", "d7 low", "d0 high", "d0 low", "d0 high")
@ident = plyr::mapvalues(x = h1b1@ident, from = current.hash, to = responder)
h1b1= StashIdent(object = h1b1, save.name = "adjmfc.time")
h1b1
# H1N1 batch 2
= c("HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6", "HTO_7", "HTO_8",
current.hash2 "HTO_9", "HTO_12", "HTO_14")
= c("d0 high", "d1 high", "d0 high", "d7 high", "unstained_control", "d0 low",
responder2 "d1 low", "d0 low", "d7 low", "d0 low", "d0 high")
@ident = plyr::mapvalues(x = h1b2@ident, from = current.hash2, to = responder2)
h1b2= StashIdent(object = h1b2, save.name = "adjmfc.time")
h1b2
# H5N1 for consistency for later scripts use adfmfc.time for H5, (HTO only map to timepoint in this batch).
= c("HTO_1", "HTO_2", "HTO_3", "HTO_4", "HTO_5", "HTO_6", "HTO_7", "HTO_8",
current.hash3 "HTO_9", "HTO_10", "HTO_12", "HTO_13", "HTO_14")
# C represents Day 100.
= c("d0", "d1", "dC", "d1", "d0", "dC", "d0", "d1",
responder3 "dC", "d0", "d1", "dC", "unstained_control")
# responder3 = c("day0", "day1", "day100", "day1", "bridge_H1209_d0", "day100", "day0", "day1",
# "day100", "day0", "day1", "day100", "unstained_control")
@ident = plyr::mapvalues(x = h5@ident, from = current.hash3, to = responder3)
h5= StashIdent(object = h5, save.name = "adjmfc.time")
h5
# merge objects and add back assay data which is lost when merging.
= MergeSeurat(h1b1, h1b2, do.normalize = FALSE,do.scale = FALSE)
s = MergeSeurat(h5, s, do.normalize = FALSE, do.scale = FALSE )
s = readRDS("data/adt.merged.v2.rds")
adt = readRDS("data/hto.merged.v2.rds")
hto
# SetAssayData contains a row match argument to make sure cell order is the same.
= SetAssayData(s, assay.type = "CITE", slot = "raw.data", new.data = adt)
s = SetAssayData(s, assay.type = "HTO", slot = "raw.data", new.data = hto)
s
# Add Timepoint metadata acros all batches. if cohort is H1 use adjmfctime sub, H5 use adjmfctime value.
=
h1h5_md @meta.data %>%
sselect(adjmfc.time, cohort, barcode_check) %>%
mutate(timepoint = if_else(cohort == "H1N1",true = str_sub(adjmfc.time, 1,2), false = adjmfc.time)) %>%
select(timepoint, barcode_check) %>%
column_to_rownames("barcode_check")
= AddMetaData(s, metadata = h1h5_md)
s
# save results
saveRDS(s, file = here("data/H1_H5_multiseq_demultiplexed_V6.rds"))
sessionInfo()
#
# R version 3.5.2 (2018-12-20)
# Platform: x86_64-redhat-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.6 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] bindrcpp_0.2.2 reshape2_1.4.3 KernSmooth_2.23-15 deMULTIplex_1.0.2 here_0.1 forcats_0.3.0
# [7] stringr_1.4.0 dplyr_0.7.8 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.1
# [13] tidyverse_1.2.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 class_7.3-14 modeltools_0.2-22 ggridges_0.5.1 rprojroot_1.3-2
# [7] mclust_5.4.3 htmlTable_1.13.1 base64enc_0.1-3 rstudioapi_0.9.0 proxy_0.4-22 npsurv_0.4-0
# [13] flexmix_2.3-14 bit64_0.9-7 lubridate_1.7.4 mvtnorm_1.0-8 xml2_1.2.0 codetools_0.2-15
# [19] splines_3.5.2 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-3 knitr_1.22 Formula_1.2-3
# [25] jsonlite_1.6 broom_0.5.0 ica_1.0-2 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7
# [31] R.oo_1.22.0 compiler_3.5.2 httr_1.3.1 backports_1.1.3 assertthat_0.2.1 lazyeval_0.2.2
# [37] cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6 tools_3.5.2 igraph_1.2.4
# [43] gtable_0.3.0 glue_1.3.1 RANN_2.6.1 Rcpp_1.0.0 cellranger_1.1.0 trimcluster_0.1-2.1
# [49] gdata_2.18.0 ape_5.2 nlme_3.1-137 iterators_1.0.10 fpc_2.1-11.1 gbRd_0.4-11
# [55] lmtest_0.9-36 xfun_0.6 rvest_0.3.2 irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8
# [61] MASS_7.3-51.1 zoo_1.8-3 scales_1.0.0 hms_0.4.2 doSNOW_1.0.16 parallel_3.5.2
# [67] RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10 pbapply_1.3-4 gridExtra_2.3 rpart_4.1-13
# [73] segmented_0.5-3.0 latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4 checkmate_1.9.1 caTools_1.17.1.1
# [79] bibtex_0.4.2 matrixStats_0.54.0 Rdpack_0.9-0 SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2
# [85] dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6 evaluate_0.13 lattice_0.20-38 ROCR_1.0-7
# [91] bindr_0.1.1 labeling_0.3 htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
# [97] magrittr_1.5 R6_2.4.0 snow_0.4-2 gplots_3.0.1.1 Hmisc_4.2-0 haven_1.1.2
# [103] pillar_1.3.1 foreign_0.8-71 withr_2.1.2 fitdistrplus_1.0-14 mixtools_1.1.0 survival_2.43-3
# [109] nnet_7.3-12 tsne_0.1-3 modelr_0.1.2 crayon_1.3.4 hdf5r_1.0.0 rmarkdown_1.10
# [115] readxl_1.1.0 grid_3.5.2 data.table_1.12.0 metap_1.0 digest_0.6.18 diptest_0.75-7
# [121] R.utils_2.7.0 stats4_3.5.2 munsell_0.5.0
The singlet calls merged with the hashing antibody barcode singlets
are then combined and qcd in the next scripts.
The singlet ID from demuxlet must match the expected subject ID based on
the hashing antibodies that identified that individual sample which were
demiltiplexed above using HTODemux and Multiseq demultiuplexing.
# demultiplex round 2; add donor ID from multiseq.
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(here))
setwd(here())
# make output for demultiplexing results.
# dir.create("data/baseline_nov22/demultiplexing_output")
= readRDS("data/h1h5.all.htodemux.rds")
s = s@meta.data %>% mutate(barcode.full = barcode_check)
metadf
# vector of tenxlanes (!!reorder by batch, lane)
= unique(s@meta.data$tenx_lane) %>% as.vector()
tenx.lanes = c(tenx.lanes[c(7:12)], tenx.lanes[c(13:18)], tenx.lanes[c(1:6)]) %>% as.vector
tenx.lanes
#!! File str must be exact and contain the.best files from version 7 output.
# read in dmeuxlet output as list of data frames; add barcode.full with lane tag string.
= list.files(path = here("demuxlet/demuxlet_run_v7"), full.names=T, pattern = ".best")
dmx.files
# file merge order checker
stopifnot(str_sub(dmx.files, -9,-9) == as.character(c(1:6,1:6,1:6)))
# visual sanity check, ensure files and lanes have the same index
print(c(dmx.files, tenx.lanes))
# read in demuxlet results
= list()
dmx.list for (i in 1:length(tenx.lanes)) {
= read.table(file = dmx.files[[i]], header = T) %>%
dmx.list[[i]] mutate(bc = substr(BARCODE, start = 1,stop = 16)) %>%
select(bc, everything()) %>%
mutate(barcode_check = paste0(bc,"_",tenx.lanes[i]))
print(dmx.files[i])
print(table(substr(dmx.list[[i]]$BEST,1,3)))
}
# combine and add batch to dmx results.
=
dmx.res do.call(rbind, dmx.list) %>%
mutate(cohort = str_sub(barcode_check, -7,-6)) %>%
mutate(batch =
if_else(cohort == "H1" & substr(barcode_check,21,21) == 1, true = "1",
if_else(cohort == "H1" & substr(barcode_check,21,21) == 2, true = "2", false = "3")))
# split by batch to map batch specific hash to samples.
= list()
dmx.batch for (i in 1:length(unique(dmx.res$batch))) {
= dmx.res %>% filter(batch == i)
dmx.batch[[i]]
}= list()
s.batch for (i in 1:length(dmx.batch)) {
= s@meta.data %>% filter(batch == i)
s.batch[[i]]
}
= full_join(dmx.batch[[1]], s.batch[[1]], by = "barcode_check")
batch_1 = full_join(dmx.batch[[2]], s.batch[[2]], by = "barcode_check")
batch_2 = full_join(dmx.batch[[3]], s.batch[[3]], by = "barcode_check")
batch_3
# map demuxlet sample ID to expected sample ID
# Batch 1
=
b1_merged %>%
batch_1 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(SNG.1ST = substr(SNG.1ST,1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",hto_classification_global)) %>%
mutate(expect_sampleID =
if_else(hash_maxID == "HTO_1" | hash_maxID == "HTO_2", true = "237:207",
if_else(hash_maxID == "HTO_3" | hash_maxID == "HTO_4", true = "212:245:256",
if_else(hash_maxID == "HTO_10", true = "256",
if_else(hash_maxID == "HTO_6" | hash_maxID == "HTO_7", true = "200:273",
if_else(hash_maxID == "HTO_8" | hash_maxID == "HTO_9", true = "277:261:233",
if_else(hash_maxID == "HTO_12", true = "233",
if_else(hash_maxID == "HTO_14", true = "209",
if_else(hash_maxID == "HTO_5", true = "unstained_control", false = "na"))))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match =
if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3, true = "1",false = "0"))
# Batch 2
=
b2_merged %>%
batch_2 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(SNG.1ST = substr(SNG.1ST,1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",hto_classification_global)) %>%
mutate(expect_sampleID =
if_else(hash_maxID == "HTO_1" | hash_maxID == "HTO_2", true = "205:250",
if_else(hash_maxID == "HTO_3" | hash_maxID == "HTO_4", true = "215:234:209",
if_else(hash_maxID == "HTO_6" | hash_maxID == "HTO_7", true = "236:279",
if_else(hash_maxID == "HTO_8" | hash_maxID == "HTO_9", true = "201:268:229",
if_else(hash_maxID == "HTO_12", true = "268",
if_else(hash_maxID == "HTO_14", true = "209",
if_else(hash_maxID == "HTO_5", true = "unstained_control", false = "na")))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match =
if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3, true = "1",false = "0"))
# batch 3
=
b3_merged %>%
batch_3 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(SNG.1ST = if_else(SNG.1ST == "209_209", true = "209", false = SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",hto_classification_global)) %>%
mutate(expect_sampleID =
if_else(hash_maxID == "HTO_1" | hash_maxID == "HTO_2" | hash_maxID == "HTO_3",true = "H5N1-017:H5N1-043:H5N1-031",
if_else(hash_maxID == "HTO_7" | hash_maxID == "HTO_8" | hash_maxID == "HTO_9", true = "H5N1-038:H5N1-021",
if_else(hash_maxID == "HTO_4" | hash_maxID == "HTO_6", true = "H5N1-043",
if_else(hash_maxID == "HTO_10" | hash_maxID == "HTO_12" | hash_maxID == "HTO_13",true = "H5N1-011",
if_else(hash_maxID == "HTO_14", true = "unstained_control",
if_else(hash_maxID == "HTO_5", true = "209", false = "na"))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match =
if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3, true = "1",false = "0"))
# merge results...
= rbind(b1_merged, b2_merged, b3_merged)
test_merged ## Save unfiltered full merged utput:
saveRDS(test_merged, file = "data/1_preprocessing_data/merged_dmx_hto_metadata_full.rds")
# remove unstained control cells, subset down to concordant hto and DMX singlets
= test_merged %>%
unstained_cells filter(expect_sampleID_1 == "unstained_control") %$%
barcode_check
# retain only singlets that are also not the unstained controls.
=
test_result_singlet %>%
test_merged filter(!(barcode_check %in% unstained_cells)) %>%
filter(dmx_hto_match == 1) %>%
filter(joint_classification_global == "SNG_Singlet")
# Save protein negative control cells for DSB normalization:
=
neg_control_cells %>%
test_merged filter(!(barcode_check %in% unstained_cells)) %>%
filter(hto_classification_global == "Negative") %>%
filter(DMX_GLOBAL_BEST == "AMB")
saveRDS(neg_control_cells, file = "data/neg_control_cells_b1_2_3.rds")
# save singlet data for downstream processing.
# remove day C H5N1 cells
# Remove bridge cells from b1 and b3 so all 209 cells arefrom batch 2, the same batch as the second 209 timepoint.
# remove negative control cells
= test_result_singlet %>% filter(batch.x %in% c("1","3") & SNG.1ST == "209")
bridge
# remove neg_control_cells, bridge sample, add demuxlet metadata, add timepoint, sample, sampleid info.
=
singlet_add %>%
test_result_singlet filter(!(barcode_check %in% bridge$barcode_check)) %>%
mutate(timepoint = str_sub(adjmfc.time, 1,2)) %>%
select(DEMUXLET.RD.PASS = RD.PASS,
DEMUXLET.N.SNP = N.SNP,DMX_GLOBAL_BEST,
DEMUXLET.BARCODE = BARCODE,
sampleid = SNG.1ST,
barcode_check,
joint_classification_global,
dmx_hto_match, %>%
timepoint) unite("sample", sampleid, timepoint, remove = FALSE) %>%
filter(!(timepoint == "dC")) %>%
column_to_rownames("barcode_check")
# remove doublets and negative cells add concordant singlets and sample IDs to seurat object
= SubsetData(s, cells.use = rownames(singlet_add)) %>% AddMetaData(metadata = singlet_add)
s = SubsetData(s, subset.raw = TRUE)
s saveRDS(s, "data/h1h5_demultilexed_singlets_htodmx.RDS")
sessionInfo()
# MPM -- ran on http://ai-rstudioprd1.niaid.nih.gov:8787/
# containerized version Jan 29 2020
# R version 3.5.2 (2018-12-20)
# Platform: x86_64-redhat-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.6 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] magrittr_1.5 bindrcpp_0.2.2 here_0.1 Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 forcats_0.3.0 stringr_1.4.0 dplyr_0.7.8 purrr_0.3.2
# [11] readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 ggplot2_3.1.1 tidyverse_1.2.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 ggridges_0.5.1 class_7.3-14 modeltools_0.2-22 rprojroot_1.3-2 mclust_5.4.3 htmlTable_1.13.1
# [9] base64enc_0.1-3 proxy_0.4-22 rstudioapi_0.9.0 npsurv_0.4-0 bit64_0.9-7 flexmix_2.3-14 mvtnorm_1.0-8 lubridate_1.7.4
# [17] xml2_1.2.0 codetools_0.2-15 splines_3.5.2 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-3 knitr_1.22 Formula_1.2-3
# [25] jsonlite_1.6 ica_1.0-2 broom_0.5.0 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7 R.oo_1.22.0 compiler_3.5.2
# [33] httr_1.3.1 backports_1.1.3 assertthat_0.2.1 lazyeval_0.2.2 cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6
# [41] tools_3.5.2 igraph_1.2.4 gtable_0.3.0 glue_1.3.1 reshape2_1.4.3 RANN_2.6.1 Rcpp_1.0.0 cellranger_1.1.0
# [49] trimcluster_0.1-2.1 gdata_2.18.0 ape_5.2 nlme_3.1-137 iterators_1.0.10 fpc_2.1-11.1 lmtest_0.9-36 gbRd_0.4-11
# [57] xfun_0.6 rvest_0.3.2 irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8 zoo_1.8-3 MASS_7.3-51.1 scales_1.0.0
# [65] hms_0.4.2 doSNOW_1.0.16 parallel_3.5.2 RColorBrewer_1.1-2 yaml_2.2.0 reticulate_1.10 pbapply_1.3-4 gridExtra_2.3
# [73] segmented_0.5-3.0 rpart_4.1-13 latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4 checkmate_1.9.1 caTools_1.17.1.1 bibtex_0.4.2
# [81] Rdpack_0.9-0 SDMTools_1.1-221 rlang_0.3.1 pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-6 bitops_1.0-6 lattice_0.20-38
# [89] ROCR_1.0-7 bindr_0.1.1 htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 R6_2.4.0 snow_0.4-2
# [97] gplots_3.0.1.1 Hmisc_4.2-0 pillar_1.3.1 haven_1.1.2 foreign_0.8-71 withr_2.1.2 mixtools_1.1.0 fitdistrplus_1.0-14
# [105] survival_2.43-3 nnet_7.3-12 tsne_0.1-3 hdf5r_1.0.0 modelr_0.1.2 crayon_1.3.4 KernSmooth_2.23-15 grid_3.5.2
# [113] readxl_1.1.0 data.table_1.12.0 metap_1.0 digest_0.6.18 diptest_0.75-7 R.utils_2.7.0 stats4_3.5.2 munsell_0.5.0
Repeat for Multiseq demultiplexed singlets:
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(here))
setwd(here())
# Get metdata and vector of sequential tenx lanes for order of current object.
#s = readRDS("data/H1_H5_hash_demultiplexed_V6.rds")
= readRDS("data/H1_H5_multiseq_demultiplexed_V6.rds")
s =
global @meta.data %>%
sselect(barcode_check, multiseq_hashid) %>%
mutate(multiseq_classification_global = if_else(str_sub(multiseq_hashid,1,2) == "HT", "Singlet", multiseq_hashid)) %>%
select(multiseq_classification_global, barcode_check) %>%
column_to_rownames("barcode_check")
= AddMetaData(s, metadata = global)
s = s@meta.data
metadf
# vector of tenxlanes (!!reorder by batch, lane)
= unique(s@meta.data$tenx_lane)
tenx.lanes = c(tenx.lanes[c(7:12)], tenx.lanes[c(13:18)], tenx.lanes[c(1:6)]) %>% as.vector
tenx.lanes
#!! File str must be exact and contain the.best files from version 7 output.
# read in dmeuxlet output as list of data frames; add barcode.full with lane tag string.
#dmx.files = list.files(path = here("../demuxlet/demuxlet_run_v7"), full.names=T, pattern = ".best")
= list.files(path = here("demuxlet/demuxlet_run_v7/"), full.names=T, pattern = ".best")
dmx.files
# ensure order of files matchesorder of lane names.
stopifnot( str_sub(dmx.files, -9, -9 ) == str_sub(tenx.lanes, -1, -1 ) )
# 1 read in 18 demuxlet output files corresponding to results from each 10X lane.
= list()
dmx.list for (i in 1:length(tenx.lanes)) {
= read.table(file = dmx.files[[i]], header = T) %>%
dmx.list[[i]] mutate(bc = substr(BARCODE, start = 1,stop = 16)) %>%
select(bc, everything()) %>%
mutate(barcode_check = paste0(bc,"_",tenx.lanes[i]))
print(dmx.files[i])
print(table(substr(dmx.list[[i]]$BEST,1,3)))
}
# combine and add batch to dmx results.
=
dmx.res do.call(rbind, dmx.list) %>%
mutate(cohort = str_sub(barcode_check, -7,-6)) %>%
mutate(batch = if_else(cohort == "H1" & substr(barcode_check,21,21) == 1, true = "1",
if_else(cohort == "H1" & substr(barcode_check,21,21) == 2, true = "2", false = "3")))
# split by batch to map batch specific hash to samples.
= list()
dmx.batch for (i in 1:length(unique(dmx.res$batch))) {
= dmx.res %>% filter(batch == i)
dmx.batch[[i]]
}= list()
s.batch for (i in 1:length(dmx.batch)) {
= s@meta.data %>% filter(batch == i)
s.batch[[i]]
}
= full_join(dmx.batch[[1]], s.batch[[1]], by = "barcode_check")
batch_1 = full_join(dmx.batch[[2]], s.batch[[2]], by = "barcode_check")
batch_2 = full_join(dmx.batch[[3]], s.batch[[3]], by = "barcode_check")
batch_3
# map demuxlet sample ID to expected sample ID
# Batch 1
=
b1_merged %>%
batch_1 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(SNG.1ST = substr(SNG.1ST,1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",multiseq_classification_global)) %>%
mutate(expect_sampleID =
if_else(multiseq_hashid == "HTO_1" | multiseq_hashid == "HTO_2", true = "237:207",
if_else(multiseq_hashid == "HTO_3" | multiseq_hashid == "HTO_4", true = "212:245:256",
if_else(multiseq_hashid == "HTO_10", true = "256",
if_else(multiseq_hashid == "HTO_6" | multiseq_hashid == "HTO_7", true = "200:273",
if_else(multiseq_hashid == "HTO_8" | multiseq_hashid == "HTO_9", true = "277:261:233",
if_else(multiseq_hashid == "HTO_12", true = "233",
if_else(multiseq_hashid == "HTO_14", true = "209",
if_else(multiseq_hashid == "HTO_5", true = "unstained_control", false = "na"))))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match = if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3, true = "1",false = "0"))
# Batch 2
=
b2_merged %>%
batch_2 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(SNG.1ST = substr(SNG.1ST,1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",multiseq_classification_global)) %>%
mutate(expect_sampleID =
if_else(multiseq_hashid == "HTO_1" | multiseq_hashid == "HTO_2", true = "205:250",
if_else(multiseq_hashid == "HTO_3" | multiseq_hashid == "HTO_4", true = "215:234:209",
if_else(multiseq_hashid == "HTO_6" | multiseq_hashid == "HTO_7", true = "236:279",
if_else(multiseq_hashid == "HTO_8" | multiseq_hashid == "HTO_9", true = "201:268:229",
if_else(multiseq_hashid == "HTO_12", true = "268",
if_else(multiseq_hashid == "HTO_14", true = "209",
if_else(multiseq_hashid == "HTO_5", true = "unstained_control", false = "na")))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match = if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3,true = "1",false = "0"))
# batch 3
=
b3_merged %>%
batch_3 mutate(SNG.1ST = as.character(SNG.1ST)) %>%
mutate(SNG.1ST = if_else(SNG.1ST == "209_209", true = "209", false = SNG.1ST)) %>%
mutate(DMX_GLOBAL_BEST = str_sub(BEST, 1,3)) %>%
mutate(joint_classification_global = paste0(DMX_GLOBAL_BEST,"_",multiseq_classification_global)) %>%
mutate(expect_sampleID =
if_else(multiseq_hashid == "HTO_1" | multiseq_hashid == "HTO_2" | multiseq_hashid == "HTO_3",true = "H5N1-017:H5N1-043:H5N1-031",
if_else(multiseq_hashid == "HTO_7" | multiseq_hashid == "HTO_8" | multiseq_hashid == "HTO_9", true = "H5N1-038:H5N1-021",
if_else(multiseq_hashid == "HTO_4" | multiseq_hashid == "HTO_6", true = "H5N1-043",
if_else(multiseq_hashid == "HTO_10" | multiseq_hashid == "HTO_12" | multiseq_hashid == "HTO_13",true = "H5N1-011",
if_else(multiseq_hashid == "HTO_14", true = "unstained_control",
if_else(multiseq_hashid == "HTO_5", true = "209", false = "na"))))))) %>%
separate(expect_sampleID, c("expect_sampleID_1", "expect_sampleID_2", "expect_sampleID_3"),sep = ":") %>%
mutate(dmx_hto_match = if_else(SNG.1ST == expect_sampleID_1 | SNG.1ST == expect_sampleID_2 | SNG.1ST == expect_sampleID_3, true = "1",false = "0"))
# merge results...
= rbind(b1_merged, b2_merged, b3_merged)
test_merged ## Save unfiltered full merged utput:
saveRDS(test_merged, file = "data/1_preprocessing_data/merged_MULTISEQ_hto_metadata_full.rds")
# remove unstained control cells
=
unstained_cells %>%
test_merged filter(expect_sampleID_1 == "unstained_control")
# saveRDS(unstained_cells, file = "preprocess_output/demultiplexing_output/unstained_cells_batch_1_2_3.rds")
# and filter only to cells that are classified as singlets called by BOTH methods and expect ID from hashing matches DMX ID
= unstained_cells %$% barcode_check
unstained_cells =
test_result_singlet %>%
test_merged filter(dmx_hto_match == 1) %>%
filter(joint_classification_global == "SNG_Singlet") %>%
filter(!(barcode_check %in% unstained_cells))
# table(test_result_singlet$multiseq_classification_global, test_result_singlet$DMX_GLOBAL_BEST)
# table(test_result_singlet$SNG.1ST, test_result_singlet$batch.x)
# table(test_result_singlet$adjmfc.time, test_result_singlet$SNG.1ST)
######### NOT USED- used the neg calls from HTODMX
#########
# Save protein negative control cells for DSB normalization:
# filter to multiseq negative and demuxlet Ambiguous cells
=
neg_control_cells %>%
test_merged filter(!(barcode_check %in% unstained_cells)) %>%
filter(multiseq_classification_global == "Negative") %>%
filter(DMX_GLOBAL_BEST == "AMB")
# saveRDS(neg_control_cells, file = "preprocess_output/demultiplexing_output/neg_control_cells_b1_2_3.rds")
#######
# save singlet data for downstream processing.
# remove day C H5N1 cells
# Remove bridge cells from b1 and b3 so all 209 samples arefrom batch 1 and 2.
# remove negative control cells
= test_result_singlet %>% filter(batch.x %in% c("1","3") & SNG.1ST == "209")
bridge
# remove neg_control_cells
=
singlet_add %>%
test_result_singlet filter(!(barcode_check %in% bridge$barcode_check)) %>%
select(DEMUXLET.RD.PASS = RD.PASS, DEMUXLET.N.SNP = N.SNP,DMX_GLOBAL_BEST, DEMUXLET.BARCODE = BARCODE,
sampleid = SNG.1ST, barcode_check, joint_classification_global, dmx_hto_match, timepoint) %>%
unite("sample", sampleid, timepoint, remove = FALSE) %>%
filter(!(timepoint == "dC")) %>%
select(-c("timepoint")) %>%
column_to_rownames("barcode_check")
# remove doublets and negative cells add concordant singlets and sample IDs to seurat object
=
s SubsetData(s, cells.use = rownames(singlet_add)) %>%
AddMetaData(metadata = singlet_add)
saveRDS(s, "data/h1_h5_demultilexed_singlets_multiseq.RDS")
sessionInfo()
Finally save the object:
h1_h5_merged_seurat_object_demultiplexed_sng.rds
, a Seurat
version 2 object that is further processed in the directory
Flu_CITEseq_normalize_cluster
.
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(here))
setwd(here())
# read HTODemux concordant singlets with Demuxlet and Multiseq concordant singlets with Demuxlet
# get union of barcodes with hash and genetic evidence for singlet call, merge into Seurat object and save for QC in normalization pipeline.
= readRDS("data/h1h5_demultilexed_singlets_htodmx.RDS")
htodmx = readRDS("data/h1_h5_demultilexed_singlets_multiseq.RDS") %>% SubsetData(subset.raw = TRUE)
multiseq
# drop pct mt to calc on sub in normalization script.
= c("nGene", "nUMI", "barcode_check", "tenx_lane" ,"cohort", "batch", "DEMUXLET.RD.PASS",
vars_add "DEMUXLET.N.SNP", "DMX_GLOBAL_BEST", "sample", "sampleid", "joint_classification_global",
"dmx_hto_match", "adjmfc.time", "timepoint")
# meta dat union
= htodmx@meta.data %>% select(vars_add, hash_call = hash_maxID)
htodmx_md = multiseq@meta.data %>% select(vars_add, hash_call = multiseq_hashid)
multiseq_md = union(htodmx_md, multiseq_md) %>% mutate(barcode = barcode_check) %>% column_to_rownames("barcode")
metadata
rm(htodmx, multiseq)
gc()
# concordant singlet from htodmx, multiseq demuxlet union object for downstream QC
= readRDS("data/Merged_H1_H5_629949_cells.rds") %>%
s SubsetData(cells.use = metadata$barcode_check, subset.raw = TRUE) %>%
AddMetaData(metadata = metadata)
gc()
saveRDS(s,file = "data/h1_h5_merged_seurat_object_demultiplexed_sng.rds")
sessionInfo()
# R version 3.5.3 Patched (2019-03-11 r77192)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Mojave 10.14.6
#
# Matrix products: default
# BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] here_0.1 magrittr_1.5 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1 purrr_0.3.3 readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 tidyverse_1.2.1 Seurat_2.3.4
# [12] Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 class_7.3-15 modeltools_0.2-22 ggridges_0.5.1 rprojroot_1.3-2 mclust_5.4.5 htmlTable_1.13.1
# [9] base64enc_0.1-3 rstudioapi_0.10 proxy_0.4-23 npsurv_0.4-0 flexmix_2.3-15 bit64_0.9-7 lubridate_1.7.4 mvtnorm_1.0-10
# [17] xml2_1.2.0 codetools_0.2-16 splines_3.5.3 R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-5 knitr_1.23 Formula_1.2-3
# [25] jsonlite_1.6 broom_0.5.2 ica_1.0-2 cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7 R.oo_1.22.0 compiler_3.5.3
# [33] httr_1.4.0 backports_1.1.4 assertthat_0.2.1 lazyeval_0.2.2 cli_1.1.0 lars_1.2 acepack_1.4.1 htmltools_0.3.6
# [41] tools_3.5.3 igraph_1.2.4.1 gtable_0.3.0 glue_1.3.1 RANN_2.6.1 reshape2_1.4.3 Rcpp_1.0.1 cellranger_1.1.0
# [49] gdata_2.18.0 ape_5.3 nlme_3.1-137 iterators_1.0.10 fpc_2.2-1 gbRd_0.4-11 lmtest_0.9-37 xfun_0.7
# [57] rvest_0.3.4 irlba_2.3.3 gtools_3.8.1 DEoptimR_1.0-8 MASS_7.3-51.1 zoo_1.8-6 scales_1.0.0 hms_0.4.2
# [65] doSNOW_1.0.16 parallel_3.5.3 RColorBrewer_1.1-2 reticulate_1.12 pbapply_1.4-0 gridExtra_2.3 rpart_4.1-13 segmented_0.5-4.0
# [73] latticeExtra_0.6-28 stringi_1.4.3 foreach_1.4.4 checkmate_1.9.3 caTools_1.17.1.2 bibtex_0.4.2 Rdpack_0.11-0 SDMTools_1.1-221.1
# [81] rlang_0.4.0 pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.3-1 bitops_1.0-6 lattice_0.20-38 ROCR_1.0-7 htmlwidgets_1.3
# [89] bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4 R6_2.4.0 generics_0.0.2 snow_0.4-3 gplots_3.0.1.1 Hmisc_4.2-0
# [97] haven_2.1.0 pillar_1.4.1 foreign_0.8-71 withr_2.1.2 fitdistrplus_1.0-14 mixtools_1.1.0 survival_2.43-3 nnet_7.3-12
# [105] tsne_0.1-3 modelr_0.1.4 crayon_1.3.4 hdf5r_1.2.0 KernSmooth_2.23-15 readxl_1.3.1 grid_3.5.3 data.table_1.12.2
# [113] metap_1.1 digest_0.6.19 diptest_0.75-7 R.utils_2.8.0 stats4_3.5.3 munsell_0.5.0
Directory 2: /Flu_CITEseq_normalize_cluster/
Script 1: runs normalization of mRNA using scran and normalization of ADTs with dsb. Flu_CITEseq_normalize_cluster/1_Flu_CITEseq_normalize/1_dsbnorm_prot_scrannorm_rna.R
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
library(here)
library(dsb)
# set save path
= here("1_Flu_CITEseq_normalize/normalization_data/")
datapath dir.create(datapath)
# read in final singlets for each batch, make list of seurat objects by batch.
= readRDS(file = here("data/h1_h5_merged_seurat_object_demultiplexed_sng.rds")) %>%
s SetAllIdent(id = "batch")
= SubsetData(s, ident.use = "1", subset.raw = T)
h1b1 = SubsetData(s, ident.use = "2", subset.raw = T)
h1b2 = SubsetData(s, ident.use = "3", subset.raw = T)
h5 rm(s)
gc()
#########################################################
###PART 1 Protein normalization with the DSB Package ####
#########################################################
# get vector of negative cells from script 4 hashing validation
# Get HTOdemux round 1 demultipleixng object including Hash calls for Sng doublet negative
= readRDS("data/neg_control_cells_b1_2_3.rds")
neg_control_cells = readRDS("data/Merged_H1_H5_629949_cells.rds") %>%
s SubsetData(cells.use = neg_control_cells$barcode_check, subset.raw = TRUE)
gc()
= unique(s@meta.data$batch)
batch_vector = SetAllIdent(s, id = "batch")
s = list()
neg_adt for (i in 1:3) {
= s %>%
neg_adt[[i]] SetAllIdent(id = "batch") %>%
SubsetData(ident.use = batch_vector[i])
}rm(s)
gc()
# filter by nGene (separate mRNA assay not considered by mdemultiplexing) to remove possible true cells
# get list of protein matrices by batch for negative control drops.
= lapply(neg_adt, function(x){ SubsetData(x, accept.high = 80, subset.name = "nGene", subset.raw = TRUE) })
neg_adt = lapply(neg_adt, function(x){ x@assay$CITE@raw.data })
neg_adt
# make list of positive protein matrices by batch
= list(h1b1, h1b2, h5)
stained = lapply(stained, function(x){x@assay$CITE@raw.data})
pos_adt
# define a vector of the isotype controls in the data
= c("Mouse IgG2bkIsotype_PROT", "MouseIgG1kappaisotype_PROT","MouseIgG2akappaisotype_PROT", "RatIgG2bkIsotype_PROT")
isotypes
# apply denoised scaled by background protein normalization.
# https://mattpm.github.io/dsb/index.html
= list()
dsb_norm for (i in 1:length(neg_adt)) {
= DSBNormalizeProtein(cell_protein_matrix = pos_adt[[i]],
dsb_norm[[i]] empty_drop_matrix = neg_adt[[i]],
denoise.counts = TRUE,
use.isotype.control = TRUE,
isotype.control.name.vec = isotypes)
}
# these data get added to object by batch at end of script
############################################
### PART 2 mRNA normalization ####
############################################
# convert to SCE object calculate QC metrics.
= list(h1b1, h1b2, h5)
sc
# get vector of cells to remove that are in the top 5% of mt expression (this version = cells with mt percentage > 10%)
= lapply(sc, function(x){ x@meta.data$pctMT %>% quantile(0.95) }) %>% unlist(use.names = F)
mt_threshold = round(mean(mt_threshold), 2)
mt_threshold
## Convert to SCE object to convert QC stats and subset on lib size and mitochondrial percentage.
= lapply(sc, function(x){ Convert(from = x, to = "sce") })
sc = lapply(sc, FUN = scater::calculateQCMetrics)
sc
# outlier cells based on log lib size (UMI) 3.5 median absolute deviations > or < median lib size
= lapply(sc, function(x){ scater::isOutlier(x$log10_total_counts, type = "lower", nmads = 3.5, log = FALSE) })
low = lapply(sc, function(x){ scater::isOutlier(x$log10_total_counts, type = "higher", nmads = 3.5, log = FALSE) })
high = lapply(sc, function(x){ x$pctMT > mt_threshold })
mt = list()
outlier_cell for (i in 1:length(sc)) {
= low[[i]] | high[[i]] | mt[[i]]
outlier_cell[[i]] = sc[[i]][ ,!outlier_cell[[i]] ]
sc[[i]]
}
#normalize multibatch normalization h1 and h5 cells.
#sc = lapply(sc, function(x) { scater::normalizeSCE(x, return_log = TRUE, log_exprs_offset = 1, centre_size_factors = FALSE)} )
= scran::multiBatchNorm( sc[[1]], sc[[2]], sc[[3]] )
batch.norm
# merge batch indexed lists of metadata and normalized counts
= c("total_features_by_counts",
vars_add "log10_total_features_by_counts",
"log10_total_counts",
"pct_counts_in_top_50_features")
= lapply(batch.norm, function(x){ logcounts(x) })
norm_sce = do.call(cbind, norm_sce)
norm_counts = lapply(batch.norm, colData)
merge_meta = do.call(rbind, merge_meta) %>%
merge_meta as.data.frame() %>%
select(vars_add)
rm(sc)
gc()
# add scran normalized counts to Seurat obect.
= readRDS(file = "data/h1_h5_merged_seurat_object_demultiplexed_sng.rds") %>% SetAllIdent(id = "cohort")
s = s %>%
s SubsetData(cells.use = rownames(merge_meta), subset.raw = TRUE) %>%
AddMetaData(metadata = merge_meta) %>%
SetAssayData(new.data = norm_counts, assay.type = "RNA", slot = "data")
@calc.params$NormalizeData = list(normalization.method = "scran")
s
# add CITE DSB normalized assay data. Merge the 3 batches and only include the outlier QCd cells
= do.call(cbind, dsb_norm)
CITE = CITE[ ,rownames(s@meta.data)]
CITE = SetAssayData(s, new.data = CITE, assay.type = "CITE", slot = "data")
s
saveRDS(s, file = paste0(datapath, "h1h5_dsb_scran_norm.rds"))
sessionInfo()
# R version 3.5.3 Patched (2019-03-11 r77192)
# attached base packages:
# [1] parallel stats4 stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] scran_1.10.2 SingleCellExperiment_1.4.1 SummarizedExperiment_1.12.0 DelayedArray_0.8.0 matrixStats_0.54.0
# [6] Biobase_2.42.0 GenomicRanges_1.34.0 GenomeInfoDb_1.18.2 IRanges_2.16.0 S4Vectors_0.20.1
# [11] BiocGenerics_0.28.0 BiocParallel_1.16.6 edgeR_3.24.3 limma_3.38.3 dsb_0.1.0
# [16] here_0.1 magrittr_1.5 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1
# [21] purrr_0.3.3 readr_1.3.1 tidyr_0.8.3 tibble_2.1.1 tidyverse_1.2.1
# [26] Seurat_2.3.4 Matrix_1.2-15 cowplot_0.9.4 ggplot2_3.1.1
#
# loaded via a namespace (and not attached):
# [1] reticulate_1.12 R.utils_2.8.0 tidyselect_0.2.5 htmlwidgets_1.3 grid_3.5.3 Rtsne_0.15
# [7] devtools_2.0.2 munsell_0.5.0 codetools_0.2-16 ica_1.0-2 statmod_1.4.32 withr_2.1.2
# [13] colorspace_1.4-1 knitr_1.23 rstudioapi_0.10 ROCR_1.0-7 robustbase_0.93-5 dtw_1.20-1
# [19] gbRd_0.4-11 Rdpack_0.11-0 labeling_0.3 lars_1.2 GenomeInfoDbData_1.2.0 pheatmap_1.0.12
# [25] bit64_0.9-7 rhdf5_2.26.2 rprojroot_1.3-2 generics_0.0.2 xfun_0.7 diptest_0.75-7
# [31] R6_2.4.0 ggbeeswarm_0.6.0 locfit_1.5-9.1 hdf5r_1.2.0 flexmix_2.3-15 bitops_1.0-6
# [37] assertthat_0.2.1 SDMTools_1.1-221.1 scales_1.0.0 nnet_7.3-12 beeswarm_0.2.3 gtable_0.3.0
# [43] npsurv_0.4-0 processx_3.3.1 rlang_0.4.0 splines_3.5.3 lazyeval_0.2.2 acepack_1.4.1
# [49] broom_0.5.2 checkmate_1.9.3 reshape2_1.4.3 modelr_0.1.4 backports_1.1.4 Hmisc_4.2-0
# [55] tools_3.5.3 usethis_1.5.0 gplots_3.0.1.1 RColorBrewer_1.1-2 proxy_0.4-23 dynamicTreeCut_1.63-1
# [61] sessioninfo_1.1.1 ggridges_0.5.1 Rcpp_1.0.1 plyr_1.8.4 base64enc_0.1-3 zlibbioc_1.28.0
# [67] RCurl_1.95-4.12 ps_1.3.0 prettyunits_1.0.2 rpart_4.1-13 pbapply_1.4-0 viridis_0.5.1
# [73] zoo_1.8-6 haven_2.1.0 cluster_2.0.7-1 fs_1.3.1 data.table_1.12.2 lmtest_0.9-37
# [79] RANN_2.6.1 mvtnorm_1.0-10 fitdistrplus_1.0-14 pkgload_1.0.2 hms_0.4.2 lsei_1.2-0
# [85] mclust_5.4.5 readxl_1.3.1 gridExtra_2.3 testthat_2.1.1 compiler_3.5.3 scater_1.10.1
# [91] KernSmooth_2.23-15 crayon_1.3.4 R.oo_1.22.0 htmltools_0.3.6 segmented_0.5-4.0 Formula_1.2-3
# [97] snow_0.4-3 lubridate_1.7.4 MASS_7.3-51.1 fpc_2.2-1 cli_1.1.0 R.methodsS3_1.7.1
# [103] gdata_2.18.0 metap_1.1 igraph_1.2.4.1 pkgconfig_2.0.2 foreign_0.8-71 xml2_1.2.0
# [109] foreach_1.4.4 vipor_0.4.5 XVector_0.22.0 bibtex_0.4.2 rvest_0.3.4 callr_3.2.0
# [115] digest_0.6.19 tsne_0.1-3 cellranger_1.1.0 htmlTable_1.13.1 DelayedMatrixStats_1.4.0 curl_3.3
# [121] kernlab_0.9-27 gtools_3.8.1 modeltools_0.2-22 nlme_3.1-137 jsonlite_1.6 Rhdf5lib_1.4.3
# [127] BiocNeighbors_1.0.0 desc_1.2.0 viridisLite_0.3.0 pillar_1.4.1 lattice_0.20-38 httr_1.4.0
# [133] DEoptimR_1.0-8 pkgbuild_1.0.3 survival_2.43-3 glue_1.3.1 remotes_2.0.4 png_0.1-7
# [139] prabclus_2.3-1 iterators_1.0.10 bit_1.1-14 class_7.3-15 stringi_1.4.3 HDF5Array_1.10.1
# [145] mixtools_1.1.0 doSNOW_1.0.16 latticeExtra_0.6-28 caTools_1.17.1.2 memoise_1.1.0 irlba_2.3.3
# [151] ape_5.3
Cluster cells using protein distance matrix on dsb normalized values without dimensionality reduction.
# combined H1 H5 clustering for H5 annotation
# This script is run in a HPC environment bc the distance matrix created is > 100gb
## clustering
# HPC
#qrsh -l himem,mem_free=640G,h_vmem=80G -pe threaded 8
#module load Anaconda3/5.3.0
#conda activate Seurat2.3.4
#cd /hpcdata/sg/sg_data/users/mulemp/1PhD/Flu_CITEseq_cluster_030920/
#R
.libPaths()
# 3.5 is now loaded with Seurat 2.3.4
# run clustering
library(Seurat)
library(tidyverse)
library(parallelDist)
library(here)
# make output directory
= here("2_Flu_CITEseq_cluster_030920/clustering_data/")
datapath dir.create(datapath)
# cluster resolution parameters (1.2 was selected from set of values)
= c(1.2)
res
## test verstion######
#h1h5 = readRDS("Flu_CITEseq_normalize_data/h1h5_dsb_scran_norm.rds") %>%
# SetAllIdent(id = "sample") %>%
# SubsetData(max.cells.per.ident = 10, subset.raw = T)
#######################
# load merged data
= readRDS("Flu_CITEseq_normalize_data/h1h5_dsb_scran_norm.rds")
h1h5 = GetAssayData(h1h5, assay.type = "CITE", slot = "data")
pmtx
# remove isotype contorls and CD206
= rownames(pmtx)
prot = prot[-c(19, 67:70)]
prot_subset = pmtx[prot_subset, ]
pmtx
# create distance matrix across all proteins.
= parDist(t(pmtx))
p3_dist = as.matrix(p3_dist)
p3_dist
# cluster
for (i in 1:length(res)) {
= FindClusters(h1h5,
h1h5 distance.matrix = p3_dist,
k.param = 50,
print.output = F,
resolution = res[i],
random.seed = 1,
algorithm = 3,
modularity.fxn = 1)
= StashIdent(h1h5, save.name = paste0("c_",res[i]))
h1h5
}= h1h5@meta.data %>% select(c_1.2)
md_save saveRDS(md_save, file = paste0(datapath,"joint_cluster_metadata.rds"))
sessionInfo()
# MPM ran on NIAID HPC Mar 9 2020 containerized version
# R version 3.5.1 (2018-07-02)
# Platform: x86_64-conda_cos6-linux-gnu (64-bit)
# Running under: Red Hat Enterprise Linux Server 7.2 (Maipo)
#
# Matrix products: default
# BLAS/LAPACK: /hpcdata/sg/sg_data/users/mulemp/condaenvirons/envs/Seurat2.3.4/lib/R/lib/libRlapack.so
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
# [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C
# [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] parallelDist_0.2.4 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3
# [5] purrr_0.3.2 readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
# [9] tidyverse_1.2.1 Seurat_2.3.4 Matrix_1.2-17 cowplot_1.0.0
# [13] ggplot2_3.2.1
#
# loaded via a namespace (and not attached):
# [1] Rtsne_0.15 colorspace_1.4-1 class_7.3-15
# [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.5
# [7] htmlTable_1.13.2 base64enc_0.1-3 rstudioapi_0.10
# [10] proxy_0.4-23 npsurv_0.4-0 flexmix_2.3-15
# [13] bit64_0.9-7 lubridate_1.7.4 xml2_1.2.2
# [16] codetools_0.2-16 splines_3.5.1 R.methodsS3_1.7.1
# [19] lsei_1.2-0 robustbase_0.93-5 knitr_1.25
# [22] zeallot_0.1.0 Formula_1.2-3 jsonlite_1.6
# [25] broom_0.5.2 ica_1.0-2 cluster_2.1.0
# [28] kernlab_0.9-27 png_0.1-7 R.oo_1.22.0
# [31] compiler_3.5.1 httr_1.4.1 backports_1.1.5
# [34] assertthat_0.2.1 lazyeval_0.2.2 cli_1.1.0
# [37] lars_1.2 acepack_1.4.1 htmltools_0.4.0
# [40] tools_3.5.1 igraph_1.2.4.1 gtable_0.3.0
# [43] glue_1.3.1 RANN_2.6.1 reshape2_1.4.3
# [46] Rcpp_1.0.2 cellranger_1.1.0 vctrs_0.2.0
# [49] gdata_2.18.0 ape_5.3 nlme_3.1-141
# [52] iterators_1.0.12 fpc_2.2-3 gbRd_0.4-11
# [55] lmtest_0.9-37 xfun_0.10 rvest_0.3.4
# [58] lifecycle_0.1.0 irlba_2.3.3 gtools_3.8.1
# [61] DEoptimR_1.0-8 MASS_7.3-51.4 zoo_1.8-6
# [64] scales_1.0.0 hms_0.5.1 doSNOW_1.0.18
# [67] parallel_3.5.1 RColorBrewer_1.1-2 reticulate_1.13
# [70] pbapply_1.4-2 gridExtra_2.3 rpart_4.1-15
# [73] segmented_1.0-0 latticeExtra_0.6-28 stringi_1.4.3
# [76] foreach_1.4.7 checkmate_1.9.4 caTools_1.17.1.2
# [79] bibtex_0.4.2 Rdpack_0.11-0 SDMTools_1.1-221.1
# [82] rlang_0.4.0 pkgconfig_2.0.3 dtw_1.21-3
# [85] prabclus_2.3-1 bitops_1.0-6 lattice_0.20-38
# [88] ROCR_1.0-7 htmlwidgets_1.5 bit_1.1-14
# [91] tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
# [94] R6_2.4.0 generics_0.0.2 snow_0.4-3
# [97] gplots_3.0.1.1 Hmisc_4.2-0 haven_2.1.1
# [100] pillar_1.4.2 foreign_0.8-72 withr_2.1.2
# [103] fitdistrplus_1.0-14 mixtools_1.1.0 survival_2.44-1.1
# [106] nnet_7.3-12 tsne_0.1-3 modelr_0.1.5
# [109] crayon_1.3.4 hdf5r_1.3.0 KernSmooth_2.23-15
# [112] readxl_1.3.1 grid_3.5.1 data.table_1.12.4
# [115] metap_1.1 digest_0.6.21 diptest_0.75-7
# [118] R.utils_2.9.0 RcppParallel_4.4.4 stats4_3.5.1
# [121] munsell_0.5.0
#
Annotate cells and use add custom gating for purifying cell
populations. Add sample metadata and save output data used as starting
object. This script creates the .rds object
Flu_CITEseq_preprocess/Flu_CITEseq_normalize_cluster/2_Flu_CITEseq_cluster_030920/clustering_data/h1h5_annotated_with_meta.rds
.
This is identical to the object in the starting data root data/
directory downloaded from zenodo with exception of removal of 1 subject
from the H5N1 cohort not consented for data reuse as described
above.
# cluster annotation and QC
suppressMessages(library(Seurat))
suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(here))
here()
= here("2_Flu_CITEseq_cluster_030920/clustering_data/")
cluster_path = here("2_Flu_CITEseq_cluster_030920/cluster_annotations/")
annotation_path
#### joint cluster data and cluster annotations
= readRDS(file = here("Flu_CITEseq_normalize_cluster/1_Flu_CITEseq_normalize/normalization_data/h1h5_dsb_scran_norm.rds"))
h1h5 = readRDS(file = here("Flu_CITEseq_normalize_cluster/2_Flu_CITEseq_cluster_030920/clustering_data/joint_cluster_metadata.rds"))
h1h5cluster = AddMetaData(h1h5, metadata = h1h5cluster)
h1h5
# sample cell type tables
#dir.create("tables")
#source(file = "functions/GetSubjectCelltypeTable.R")
#joint_table = GetSubjectCelltypeTable(Seurat.Object = h1h5, celltype_column = "c_1", sample_column = "sampleid")
#write.csv(as.matrix(joint_table$table), file = "tables/joint_sample_table.csv")
# load coarse annotation mapping from contingency with fds_v3 data version
= read_delim(file = here('Flu_CITEseq_normalize_cluster/2_Flu_CITEseq_cluster_030920/cluster_annotations/cluster_annotations2.txt'),
annotation delim = "\t")
# get joint metadata
= h1h5@meta.data
md $celltype_m_joint = plyr::mapvalues(md$c_1.2, from = annotation$joint_cluster, to = annotation$annotation)
md= md %>% select(celltype_m_joint)
mdadd = AddMetaData(h1h5, metadata = mdadd)
h1h5
# plot dist of mg subsets
source(file = "functions/geneplot4.R")
#
= h1h5 %>%
mem SetAllIdent(id = "celltype_m_joint") %>%
SubsetData(ident.use = "CD4_Efct_Mem_Tcell", subset.raw = TRUE)
# plot marker distribution
= GenePlot4(mem, gene1 = "CD161_PROT", gene2 = "CD45RA_PROT")
p1 = GenePlot4(mem, gene1 = "CD3_PROT", gene2 = "CD62L_PROT", plot.ident = T, ident.plot = "batch")
p2 = plot_grid(p1,p2)
p
# Gate out naive cells and manually gate CD161+ and Central memory T cells from the joint memory cluster.
= function(SeuratObject, return.seurat = FALSE ) {
Gate_naive = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] > 3 ))
cells if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
Gate_161 = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] < 3 &
cells "CD161_PROT", ] > 6.5 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
Gate_mem = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] < 3 &
cells "CD161_PROT", ] < 6.5 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
Gate_cm = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] < 3 &
cells "CD161_PROT", ] < 6.5 &
adt["CD62L_PROT", ] > 6 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= Gate_naive(mem)
naive = Gate_161(mem)
t161 = Gate_mem(mem, return.seurat = TRUE)
mem_true = Gate_cm(mem_true)
cm
= h1h5@meta.data %>%
newmd mutate_if(is.factor, as.character) %>%
select(barcode_check, c_1.2, celltype_m_joint, sample) %>%
mutate(celltype_m_joint =
if_else(celltype_m_joint == "CD4_Efct_Mem_Tcell" & barcode_check %in% naive, true = "CD4Naive_Tcell",
if_else(celltype_m_joint == "CD4_Efct_Mem_Tcell" & barcode_check %in% t161, true = "CD4_CD161_Mem_Tcell",
if_else(celltype_m_joint == "CD4_Efct_Mem_Tcell" & barcode_check %in% mem_true@cell.names, true = "CD4_Efct_Mem_Tcell",
if_else(celltype_m_joint == "CD4_Efct_Mem_Tcell" & barcode_check %in% cm, true = "CD4_Central_Mem_Tcell", false = celltype_m_joint)))))
# table(newmd$celltype_m_joint, newmd$sample)
## MAIT vs CD161+ CD8 t cells
= c("CD8_CD161_Tcell", "CD8_Mem_Tcell", "CD8_Naive_Tcell", "MAIT_Like")
cd8_clusters = SetAllIdent(h1h5, id = "celltype_m_joint")
h1h5 = SubsetData(h1h5, ident.use = cd8_clusters, subset.raw = TRUE)
cd8 RidgePlot(cd8, features.plot = c("CD3_PROT", "CD4_PROT", "CD8_PROT", "CD161_PROT", "CD45RA_PROT", "CD45RO_PROT"), same.y.lims = F)
# fix mait
= h1h5 %>%
mait SetAllIdent(id = "celltype_m_joint") %>%
SubsetData(ident.use = "MAIT_Like", subset.raw = TRUE)
= WhichCells(mait, accept.high = 4.5, subset.name = "CD8_PROT")
mait_cells = WhichCells(mait, accept.low = 4.5, subset.name = "CD8_PROT")
mem_cells
= newmd %>%
newmd2 mutate(celltype_m_joint =
if_else(celltype_m_joint == 'MAIT_Like' & barcode_check %in% mem_cells, true = "CD8_Mem_Tcell",
if_else(celltype_m_joint == 'MAIT_Like' & barcode_check %in% mait_cells, true = "MAIT_Like",
false = celltype_m_joint )))
= SubsetData(cd8, ident.use = "CD8_Mem_Tcell")
memory = GenePlot4(memory, gene1 = "CD45RO_PROT", "CD161_PROT")
p1 = GenePlot4(memory, gene1 = "CD3_PROT", "CD161_PROT")
p2 = GenePlot4(memory, gene1 = "CD8_PROT", "CD161_PROT")
p3 = GenePlot4(memory, gene1 = "CD45RO_PROT", "CD45RA_PROT")
p4 plot_grid(p1,p2, p3, p4)
# based on below add to mait and CD8 CD161 cluster respectivey with gates.
= WhichCells(memory, accept.low = 3, subset.name = "CD161_PROT")
mait_like = SubsetData(memory, cells.use = mait_like)
test GenePlot4(test, 'CD3_PROT', 'CD8_PROT')
= function(SeuratObject, return.seurat = FALSE ) {
Gate_CD8CD161 = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD8_PROT", ] > 4.5 &
cells # adt["CD45RO_PROT", ] < 6 &
"CD161_PROT", ] > 3 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
Gate_MAIT = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD8_PROT", ] < 4.5 &
cells # adt["CD45RO_PROT", ] < 6 &
"CD161_PROT", ] > 3 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
GateNiaveCD8 = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] > 4 &
cells "CD45RO_PROT", ] < 6 &
adt["CD161_PROT", ] < 4.5 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= function(SeuratObject, return.seurat = FALSE ) {
GateMemCD8 = as.matrix(SeuratObject@assay$CITE@data)
adt <- names(which( adt["CD45RA_PROT", ] < 4 &
cells "CD45RO_PROT", ] > 6 &
adt["CD161_PROT", ] < 4.5 ))
adt[if(return.seurat == TRUE) {
= SubsetData(SeuratObject, cells.use = cells)
sub return(sub)
else { return(cells) }
}
}
= Gate_CD8CD161(memory)
cd8cd161 = Gate_MAIT(memory)
mait_2 = GateNiaveCD8(memory)
naive_2 = GateMemCD8(memory)
memcd8
= newmd2 %>%
newmd3 mutate(celltype_m_joint =
if_else(celltype_m_joint == "CD8_Mem_Tcell" & barcode_check %in% cd8cd161, true = "CD8_CD161_Tcell",
if_else(celltype_m_joint == "CD8_Mem_Tcell" & barcode_check %in% mait_2, true = "MAIT_Like",
if_else(celltype_m_joint == "CD8_Mem_Tcell" & barcode_check %in% naive_2, true = "CD8_Naive_Tcell",
if_else(celltype_m_joint == "CD8_Mem_Tcell" & barcode_check %in% memcd8, true = "CD8_Mem_Tcell",
false = celltype_m_joint)))))
# table(newmd3$celltype_m_joint, newmd3$sample)
# add metadata and save object with joint cluster annotations.
= newmd4 %>%
newmd4 select(celltype_joint = celltype_m_joint, barcode_check) %>%
column_to_rownames("barcode_check")
= AddMetaData(h1h5,metadata = newmd4)
h1h5 saveRDS(h1h5, file = paste0(cluster_path, "h1h5_annotated.rds"))
This is used as the starting object for downstream analysis. This object is a Seurat version 2 S4 class object, but it is parsed and analyzed with other packages and custom functions including Seurat version 4 (not version 2).
Software package versions
R version 4.0.5
dsb_1.0.2
ggsci_2.9
sp_1.4-5
SeuratObject_4.1.0
Seurat_4.0.1
viridis_0.5.1
viridisLite_0.3.0
scglmmr_0.1.0
variancePartition_1.25.6
BiocParallel_1.24.1
limma_3.46.0
magrittr_2.0.3
lme4_1.1-26
Matrix_1.4-1
emmeans_1.5.4
Rcpp_1.0.9
HDStIM_0.1.0
here_1.0.1
forcats_0.5.1
stringr_1.4.0
dplyr_1.0.4
purrr_0.3.4
readr_1.4.0
tidyr_1.1.2
tibble_3.1.8
ggplot2_3.3.3
tidyverse_1.3.0
R version 3.5.3
scglmmr_0.1.0
ggraph_1.0.2
igraph_1.2.4.1
ggsci_2.9
ggridges_0.5.1
monocle_2.10.1
DDRTree_0.1.5
irlba_2.3.3
VGAM_1.1-1
Biobase_2.42.0
BiocGenerics_0.28.0
viridis_0.5.1
viridisLite_0.3.0
here_0.1
Seurat_2.3.4
Matrix_1.2-15
cowplot_0.9.4
magrittr_2.0.1
forcats_0.4.0
stringr_1.4.0
dplyr_0.8.5
purrr_0.3.3
readr_1.3.1
tidyr_1.0.2
tibble_2.1.1
ggplot2_3.1.1
tidyverse_1.2.1