Skip to content
Snippets Groups Projects
turtle_final_analysis.R 7.11 KiB
# analysis pipeline for the turtle.all dataset, after SVM data filtering
# analysis with the R package Seurat (Rahul Satija lab); see also tutorials on the Seurat webpage 
# here: code used for the turtle.all dataset. A similar code was used for the other datasets (lizard.all, turtle.neurons and lizard.neurons). 
# The parameters chosen for the analysis of these datasets (number of principal components, resolution parameter for clustering etc) are reported in the Supplementary Materials of Tosches et al 2018

# set working directory, load packages and functions

library(Seurat)
library(DESeq2)
library(edgeR)
library(limSolve)
library(scran)
library(Rtsne)
library(matrixStats)
library(gplots)
library(dplyr)
library(Matrix) # this is for working with sparse matrices
library(useful)
library(gmodels)
library(sva)
library(bigpca)


source("./DropSeq_functions.R")
source("./CalcOurMetadata.R")

############################# entire turtle dataset ##################################

# the turtle object loaded below includes all cells, before SVM filtering. 
# It also has a svm_class column in the data.info slot, where good and bad quality cells are assigned to the +1 and -1 classes, respectively. 
load("161105_turtle_all_.Robj")

# set file names
Date = "170210"
ObjectName = "turtle.all"  # Seurat object name
Description = "filt_combat"  
comments = "cells from the object 161105_turtle_all_.Robj. Batch correction with Combat. Selection of PCs with randomization test. Slow tSNE."

# import data
turtle.good.cells <- rownames(turtle@data.info[turtle@data.info$svm_class == "1",])
turtle.all.data <- turtle@raw.data[,turtle.good.cells] 
dim(turtle.all.data)  
# [1] 23500 18853

# setup Seurat object with good quality cells only
turtle.all  <- new("seurat", raw.data = turtle.all.data)
turtle.all <- Setup(turtle.all, min.cells = 3, min.genes = 800, project = "turtle", do.scale = T, do.center = F) 
# 22334 genes across 18853 samples

# add metadata from source: this adds experiment info (batches etc) 
turtle.all@data.info <- CalcOurMetadata(turtle.all@ident,turtle.all@data,turtle.all@data.info)

# add metadata: this adds QC data
metadata <- read.table("d34567_stats_all_all_cells.txt")
metadata <- metadata[rownames(metadata) %in% rownames(turtle.all@data.info),]
turtle.all = AddMetaData(turtle.all, metadata[,8:12])

# filter, subsetting etc 
pMitoHigh = 7
pIntergenicHigh = 60
pCodingLow = 20
pIntronicHigh = 30
pUtrLow = 10

turtle.all <- SubsetData(turtle.all, subset.name = "percent.mito", accept.high = pMitoHigh)
#  22334 genes across 18842 samples.
turtle.all <- SubsetData(turtle.all, subset.name = "pINTERGENIC", accept.high = pIntergenicHigh)
# 22334 genes across 18838 samples.
turtle.all <- SubsetData(turtle.all, subset.name = "pCODING", accept.low = pCodingLow)
# 22334 genes across 18829 samples.
turtle.all <- SubsetData(turtle.all, subset.name = "pINTRONIC", accept.high = pCodingLow)
# 22334 genes across 18828 samples.
turtle.all <- SubsetData(turtle.all, subset.name = "pUTR", accept.low = pUtrLow)
# 22334 genes across 18828 samples.

dim(turtle.all@data)
# [1] 22334 18828

# Find variable genes
xlow=0.25
xhigh=3
ylow=0.5

pdf(file=paste(Date, ObjectName, Description, "MeanVarPlot", ".pdf", sep="_"))
turtle.all <- MeanVarPlot(turtle.all, x.low.cutoff = xlow, x.high.cutoff= xhigh, y.cutoff = ylow)
dev.off()
length(turtle.all@var.genes)
# 1101


################# CORRECT FOR BATCH EFFECTS ###################
###############################################################

# batch correction with ComBat
# see also Shekhar et al Cell 2016

batch.ids <- as.factor(turtle.all@data.info$animalident)
data.use = turtle.all@data
corrected.data <- ComBat(dat=data.use, batch=batch.ids, par.prior=T, prior.plots=F) 
turtle.all@scale.data <- t(scale(t(corrected.data)))


######## PCA and FIND THE SIGNIFICANT PRINCIPAL COMPONENTS ###########
######################################################################

turtle.all <- PCAFast(turtle.all, pc.genes = turtle.all@var.genes, pcs.compute = 100, do.print = F) 

# eigenvalues
turtle.pca.eig <- turtle.all@pca.obj[[1]]$d

# plot PCA
pdf(file=paste(Date, ObjectName, Description, "PCA", ".pdf", sep="_"))
PCElbowPlot(turtle.all, num.pc=100)
plot(turtle.pca.eig, pch=19, cex=0.5)
PCAPlot(turtle.all,1,2, pt.size=0.7, no.legend=T)
GenePlot(turtle.all@pca.rot[,1:2], turtle.all@data.info)
PCHeatmap(turtle.all, pc.use = 1, cells.use = 100, do.balanced = TRUE)
PCHeatmap(turtle.all, pc.use = 2, cells.use = 100, do.balanced = TRUE)
dev.off()


####### Significant principal components

# permutation test to find number of significant PCs

library(bigpca)

pc.data = turtle.all@scale.data[turtle.all@var.genes,]
eig.data = big.PCA(t(pc.data), pcs.to.keep = NA, thin = FALSE, SVD = F, save.pcs = FALSE, verbose=F)
eig.data = eig.data$Evalues
eig.random <- list() # empty list to fill
pb   <- txtProgressBar(1, 1000, style=3)
for (i in (1:1000)) {
     random.data <- as.matrix(apply(pc.data,1,sample)) # here it permutes the data
     capture.output(random.pca <- big.PCA(random.data, pcs.to.keep = NA, thin = FALSE, SVD = F, save.pcs = FALSE, verbose=F))       		 eig.random[[i]] <- random.pca$Evalues  # save PCA eigenvalues
     if(i %% 10 == 0){
          setTxtProgressBar(pb, i)}
}

eig.random.table <- data.frame(matrix(unlist(eig.random),nrow=1000, byrow=T)) 
colnames(eig.random.table) <- paste0("PC",1:ncol(eig.random.table))
write.table(eig.random.table, file=paste(Date, ObjectName, Description, "randomizedPCs", ".txt", sep="_"), quote=F, col.names=T,row.names=F)
nPCs = length(eig.data[eig.data > max(eig.random.table)])
# nPCs is 35


######### PCA and tSNE ###########
##################################

turtle.all <- RunTSNE(turtle.all, dims.use = 1:nPCs, do.fast = F)

res = 0.3	 # resolution for clustering
turtle.all <- FindClusters(turtle.all ,pc.use = 1:nPCs, resolution = res, reuse.SNN = F, do.sparse = T, save.SNN=T)
turtle.all = BuildClusterTree(turtle.all,do.plot=F)  

# plot cluster tree
pdf(file=paste(Date, ObjectName, Description, (paste(nPCs,"PCs",sep="")), (paste("res",res,sep="")),"ClusterTree", ".pdf", sep="_"))
PlotClusterTree(turtle.all)
dev.off()

# plot tSNE
pdf(file=paste(Date, ObjectName, Description, (paste(nPCs,"PCs",sep="")), (paste("res",res,sep="")),"tSNE", ".pdf", sep="_"), width=8)
TSNEPlot(turtle.all, pt.size=0.7, do.label=T)
TSNEPlot(turtle.all, pt.size=0.7, group.by="animalident")
TSNEPlot(turtle.all, pt.size=0.7, group.by="areaident")
dev.off()

# plot metadata on tSNE
pdf(file=paste(Date, ObjectName, Description, (paste(nPCs,"PCs",sep="")),"tSNE_metadata", ".pdf", sep="_"))
GenePlot(turtle.all@tsne.rot, turtle.all@data.info)
UMIPlot(turtle.all@tsne.rot, turtle.all@data.info)
pCODINGPlot(turtle.all@tsne.rot, turtle.all@data.info)
pUTRPlot(turtle.all@tsne.rot, turtle.all@data.info)
pINTERGENICPlot(turtle.all@tsne.rot, turtle.all@data.info)
MitoPlot(turtle.all@tsne.rot, turtle.all@data.info)
dev.off()

sink(file=paste(Date, ObjectName, Description, (paste(nPCs,"PCs",sep="")), (paste("res",res,sep="")),"AnalysisLog", ".txt", sep="_"))
PrintAnalysisLog(turtle.all, turtle.all@data, turtle.all@data.info,  turtle.all@raw.data, comments)
sink()

save(turtle.all, file = paste(Date, ObjectName, Description, ".Robj", sep="_"))