# InteRD

## Installation

if (!require("devtools")) {
install.packages("devtools")
}
devtools::install_github("chencxxy28/InteRD")

In this tutorial, we use SCDC (Dong et al. (2021)) for illustration to conduct reference-based deconvolution.

#load
library(InteRD)

## Implementation (Case study 1)

First, read in scRNA-seq data from our website

readRDSFromWeb <- function(ref) {
}
xin<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/Xin_nonD.rds")

Second, use function generateBulk() to create the pseudo bulk samples, where ct.varname specifies the name of cell type clustering result variable. sample specifies the name of subjects information variable. ct.sub specifies the names of cell types used to construct pseudo bulk samples. Here we provide an example where the pseudo bulk samples are generated from Segerstolpe et al. (2016).

set.seed(1234567)
pseudo.seger<-generateBulk(seger[["sc.eset.qc"]], ct.varname = "cluster", sample = "sample", ct.sub = c("alpha","beta","delta","gamma"), nbulk = 40, low_s = 0.3, upp_s = 0.7)
truep<-pseudo.seger$true_p[complete.cases(pseudo.seger$true_p),]

The generated pseudo bulk object contains a matrix of true cell type proportions (pseudo.seger$truep) and the ExpressionSet object (pseudo.seger$pseudo_eset).

InteRD1 algorithm is the first step of InteRD to do multiple reference ensemble, which takes as input the target bulk data, a list of marker genes (pre-selected by Seurat based on a single cell data from Xin et al. (2016)), a list of subject-level cell type proportions based on each reference set. For illustration, we use the function SCDC_ENSEMBLE() from SCDC package (Dong et al. (2021)) to obtain single-reference-based estimates and ensemble estimates. For reference panels, we consider two single cell data, one from Baron et al. (2016), and the other from Xin et al. (2016). The estimates can be obtained by running the following code

set.seed(1234567)
library(SCDC)
##ensemble of multiple reference sets
#resuts based on SCDC
pancreas.sc <- list(baron = baron$sc.eset.qc, xin = xin ) SCDC_results<-SCDC_ENSEMBLE(bulk.eset = pseudo.seger$pseudo_eset, sc.eset.list = pancreas.sc, ct.varname = "cluster",
sample = "sample", weight.basis =TRUE,truep = truep, ct.sub =  c("alpha","beta","delta","gamma"), search.length = 0.02,grid.search=TRUE)
comb<-SCDC_results$prop.only weight_matrix<-SCDC_results$w_table["mAD_Y",1:2]

# saveRDS(comb,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/comb_seger.rds")
#results based on InteRD1
list_marker<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/list_markerbaron20.rds") #get markers selected from xin et al (2016)
lambda_option<-c(0,0.01,0.05,0.1,1,5,100)
cell_type_unique<-c("alpha","beta","delta","gamma")
InteRD1.output<-InteRD1(bulk.data =pseudo.seger$pseudo_eset,list_marker,cell_type_unique,comb_used=comb,lambda_option) InteRD1<-InteRD.predict.prop(InteRD.output=InteRD1.output) We provide the function evaluate to assess the performance of estimated proportions versus the true proportions based on mean absolute deviance (MAD, the smaller the better), Kendall correlation coefficient (Ken, the larger the better), and Pearson correlation coefficient (Cor, the larger the better). evaluate(SCDC_ENSEMBLE_MAD,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.1069572 0.7358974 0.8939433 evaluate(InteRD1,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.04851359 0.004487179 0.02931761 InteRD2 algorithm is the second step of InteRD to further integrate prior biological information into the deconvolution in a robust manner. The prior information is the population-level mean cell-type proportions and their corresponding standard deviations across samples. In this tutorial, we obtain this information from scRNA-seq data from Segerstolpe et al. (2016). ave_est = pop.ct.prop.scRNA(scRNA=seger[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.prop
ave_sd = pop.ct.prop.scRNA(scRNA=seger[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.sd lambda_option<-c(0,seq(from=1,to=20,length=4),seq(from=30,to=100,length=4),200,500,1000000^2) InteRD2.output<-InteRD2(bulk.data=pseudo.seger$pseudo_eset,list_marker,cell_type_unique,comb_sampled=InteRD1,ave_est,ave_sd,lambda_option=lambda_option)
InteRD2<-InteRD.predict.prop(InteRD.output=InteRD2.output)

Note that Reference-free approach and TOAST are special cases of InteRD2. The TOAST can be run based on the function InteRD2 by specifying lambda_option=0, and Reference-free approach can be run by the function Ref_free. The following is the demo for reference-free approach.

ref_free.output<-Ref_free(bulk.data=pseudo.seger$pseudo_eset,list_marker=list_marker,cell_type_unique=cell_type_unique) ## [1] 1 reffree<-InteRD.predict.prop(InteRD.output=ref_free.output) Based on the evaluations, we can tell that InteRD2 is better than InteRD1, the InteRD estimates are all better than the existing method. evaluate(SCDC_ENSEMBLE_MAD,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.1069572 0.7358974 0.8939433 evaluate(reffree,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.03773585 0.7487179 0.9264559 evaluate(InteRD1,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.04851359 0.004487179 0.02931761 evaluate(InteRD2,pseudo.seger$true_p)$all.eva ## all.mad all.ken all.cor ## 1 0.01548666 0.7583333 0.9313362 ## Implementation (Case study 2) Now let us consider another pseudo data generated from Baron et al. (2016). set.seed(1234567) pseudo.baron<-generateBulk(baron[["sc.eset.qc"]], ct.varname = "cluster", sample = "sample", ct.sub = c("alpha","beta","delta","gamma"), nbulk = 40, low_s = 0.3, upp_s = 0.7) truep<-pseudo.baron$true_p[complete.cases(pseudo.baron$true_p),] Then we apply SCDC to obtain the estimates based on InteRD and SCDC-ENSEMBLE, where we consider two single cell data, one from Segerstolpe et al. (2016), and the other from Xin et al. (2016). The estimates can be obtained by running the following code set.seed(1234567) ##ensemble of multiple reference sets #resuts based on SCDC pancreas.sc <- list(seger = seger$sc.eset.qc,
xin   = xin
)
SCDC_results<-SCDC_ENSEMBLE(bulk.eset = pseudo.baron$pseudo_eset, sc.eset.list = pancreas.sc, ct.varname = "cluster", sample = "sample", weight.basis =TRUE,truep = truep, ct.sub = c("alpha","beta","delta","gamma"), search.length = 0.02,grid.search=TRUE) comb<-SCDC_results$prop.only
weight_matrix<-SCDC_results$w_table["mAD_Y",1:2] SCDC_ENSEMBLE_MAD<-SCDC:::wt_prop(weight_matrix,comb) # saveRDS(SCDC_ENSEMBLE_MAD,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/SCDC_ENSEMBLE_MAD_baron.rds") # saveRDS(comb,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/comb_baron.rds") #results based on InteRD1 SCDC_ENSEMBLE_MAD<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/SCDC_ENSEMBLE_MAD_baron.rds") comb<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/comb_baron.rds") list_marker<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/list_markerbaron20.rds") #get markers selected from xin et al (2016) lambda_option<-c(0,0.01,0.05,0.1,1,5,100) cell_type_unique<-c("alpha","beta","delta","gamma") InteRD1.output<-InteRD1(bulk.data =pseudo.baron$pseudo_eset,list_marker,cell_type_unique,comb_used=comb,lambda_option)
InteRD1<-InteRD.predict.prop(InteRD.output=InteRD1.output)

After obtaining InteRD1, we can obtain InteRD2 by integrating prior biological information into the deconvolution in a robust manner. The prior information is the population-level mean cell-type proportions and their corresponding standard deviations across samples that are obtained from scRNA-seq data from Baron et al. (2016).

ave_est = pop.ct.prop.scRNA(scRNA=baron[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.prop ave_sd = pop.ct.prop.scRNA(scRNA=baron[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.sd
lambda_option<-c(0,seq(from=1,to=20,length=4),seq(from=30,to=100,length=4),200,500,1000000^2)
InteRD2.output<-InteRD2(bulk.data=pseudo.baron$pseudo_eset,list_marker,cell_type_unique,comb_sampled=InteRD1,ave_est,ave_sd,lambda_option=lambda_option) InteRD2<-InteRD.predict.prop(InteRD.output=InteRD2.output) Based on the evaluations, we can tell that InteRD-based estimates (both InteRD1 and InteRD2) are better than the existing method. evaluate(SCDC_ENSEMBLE_MAD,pseudo.baron$true_p)$all.eva evaluate(InteRD1,pseudo.baron$true_p)$all.eva evaluate(InteRD2,pseudo.baron$true_p)\$all.eva