Commit c660e865 authored by Georgi Tushev's avatar Georgi Tushev
Browse files

ercc normed

parent afcabb43
.DS_Store
.Rhistory
*.m~
......@@ -8,40 +8,27 @@
# load edgeR
library(edgeR);
library(RUVSeq);
library(rstudioapi)
# set working directory
(WD <- dirname(rstudioapi::getSourceEditorContext()$path))
if (!is.null(WD)) setwd(WD)
# clean current variables
rm(list = ls());
# read UTR counts
utrs<-as.matrix(read.table("/Users/tushevg/Desktop/UTRPolyribosomes/countTable_UTRPolysomes_UTRs_21Nov2017.txt",stringsAsFactors=F, header=T, row.names=1, check.names=F, colClasses=c("factor", rep("NULL", 5), rep("integer", 12))));
filterUTRs<-grep("group", rownames(utrs));
utrs<-as.matrix(read.table("../tables/countTable_UTRPolysomes_UTRSupp_23Nov2017.txt",stringsAsFactors=F, header=T, row.names=1, check.names=F));
# read ERCC counts
ercc<-as.matrix(read.table("/Users/tushevg/Desktop/UTRPolyribosomes/countTable_UTRPolysomes_ERCC_21Nov2017.txt",stringsAsFactors=F, header=T, row.names=1, check.names=F));
ercc<-as.matrix(read.table("../tables/countTable_UTRPolysomes_ERCC_21Nov2017.txt",stringsAsFactors=F, header=T, row.names=1, check.names=F));
filterErcc<-apply(ercc, 1, function(x) length(x[x>=5])>=2);
ercc<-ercc[filterErcc,];
# reorder data by name
utrs<-utrs[,c("InputHc2","InputHc3","80SHc2","80SHc3","2and3Hc2","2and3Hc3","4and5Hc2","4and5Hc3","6and7Hc2","6and7Hc3","bigger7Hc2","bigger7Hc3")];
ercc<-ercc[,c("InputHc2","InputHc3","80SHc2","80SHc3","2and3Hc2","2and3Hc3","4and5Hc2","4and5Hc3","6and7Hc2","6and7Hc3","bigger7Hc2","bigger7Hc3")];
# concatenate data
data<-rbind(utrs, ercc);
# filter for 2 reads per million
#libsize<-colSums(data);
#tmp<-1e6*data / libsize;
#filter<-apply(tmp, 1, function(x) length(x[x>=2])>=2);
#data<-data[filter,];
# filter for 5 reads in at least two samples
filter<-apply(data, 1,function(x) length(x[x>=5])>=2);
data<-data[filter,];
utrs<-rownames(data)[grep("group", rownames(data))];
ercc<-rownames(data)[grep("^ERCC", rownames(data))];
# normalize with loess regression
weights<-c(7/100,7/100,10/1000,10/1000,16/1000,16/1000,22/1000,22/1000,24/1000,24/1000,7/100,7/100);
namesUtrs<-rownames(data)[grep("group", rownames(data))];
namesErcc<-rownames(data)[grep("^ERCC", rownames(data))];
# set factors
levelSample<-as.factor(rep(c("Input","80S","2and3","4and5","6and7","bigger7"), each=2));
......@@ -51,7 +38,7 @@ set<-newSeqExpressionSet(as.matrix(data), phenoData=data.frame(levelSample, row.
# plot raw data
library(RColorBrewer)
colors<-brewer.pal(3, "Set2");
colors<-brewer.pal(6, "Set2");
plotRLE(set, outline=FALSE, ylim=c(-4,4), col=colors[levelSample]);
plotPCA(set, col=colors[levelSample]);
......@@ -61,12 +48,12 @@ plotRLE(set, outline=FALSE, ylim=c(-4,4), col=colors[levelSample]);
plotPCA(set, col=colors[levelSample]);
# unwanted variation with spiked-ins
set1<-RUVg(set, ercc, k=1);
set1<-RUVg(set, namesErcc, k=1);
pData(set1)
plotRLE(set, outline=FALSE, ylim=c(-4,4), col=colors[levelSample]);
plotPCA(set, col=colors[levelSample]);
write.table(normCounts(set1),"/Users/tushevg/Desktop/UTRPolyribosomes/countTable_UTRPolysomes_RUVg_22Nov2017.txt",quote=FALSE,sep="\t",col.names=TRUE);
write.table(normCounts(set),"../tables/countTable_UTRPolysomes_RUVg_22Nov2017.txt",quote=FALSE,sep="\t",col.names=TRUE);
# differential expression analysis EdgeR
design<-model.matrix(~levelSample + W_1, data=pData(set1));
......
% normaliseErcc
clc
clear variables
close all
%% read count file
data = readCountTable('../tables/countTable_UTRPolysomes_UQNormed_24Nov2017.txt');
%% extract ercc
idxErcc = strncmp('ERCC', data.label, 4);
ercc.ids = data.label(idxErcc);
ercc.counts = data.counts(idxErcc,:);
%% read ercc mix concentration
fr=fopen('../tables/cms_095046.txt','r');
fgetl(fr);
txt = textscan(fr,'%*s %s %*s %n %n %*s %*s','delimiter','\t');
fclose(fr);
mix = txt{2};
[~, idxRef, idxQry] = intersect(txt{1}, ercc.ids);
ercc.mix(idxQry,1) = mix(idxRef);
%% assign weight
ercc.volume = [7, 10, 16, 22, 24, 7];
ercc.dilution = 1./[100, 1000, 1000, 1000, 1000, 100];
ercc.weight = ercc.volume .* ercc.dilution;
ercc.weight = repmat(ercc.weight, 2, 1);
ercc.weight = ercc.weight(:);
%% calculate concentration
ercc.attomoles = ercc.mix * ercc.weight';
%% linear regression
clrmtx = jet(6);
figure('color','w');
i = 1;
pArray = zeros(12, 2);
hold on;
k = 1;
for k = 1 : 12
X = log2(ercc.counts(:,k) + 1);
Y = log2(ercc.attomoles(:,k) + 1);
idxFilter = X >= 4;
plot(X,Y,'.','color',clrmtx(i,:));
p = polyfit(X(idxFilter),Y(idxFilter),1);
pArray(k,:) = p;
xfit = linspace(min(X),max(X),100);
yfit = polyval(p, xfit);
plot(xfit, yfit, '-','color',clrmtx(i,:),'linewidth',1.2);
if mod(k,2) == 0
i = i + 1;
end
end
hold off;
nfactor = mean(pArray(:,1))./pArray(:,1);
ncounts = bsxfun(@times, data.counts(~idxErcc,:), nfactor') + 1;
%% write out normed counts
fmtout = repmat({'%.6f\t'},size(ncounts,2)+1,1);
fmtout(1) = {'%s\t'};
fmtout(end) = {'%.6f\n'};
fmtout = sprintf('%s',fmtout{:});
mtxout = [data.label(~idxErcc),num2cell(ncounts)]';
hdrout = sprintf('%s\t',data.header{:});
hdrout(end) = [];
fw = fopen('../tables/countTable_UTRPolysomes_ERCCNormed_24Nov2017.txt','w');
fprintf(fw,'%s\n',hdrout);
fprintf(fw,fmtout,mtxout{:});
fclose(fw);
%% plot results
%{
rle = log2(ncounts ./ median(ncounts(:)));
figure('color','w');
hold on;
plot([0.5,12.5],[0,0],'k');
boxplot(rle,'Symbol','','colors',kron(clrmtx,ones(2,1)));
hold off
set(gca,'Box','off',...
'XTickLabel',data.header(2:end),...
'XTickLabelRotation',45,...
'YLim',[-5,8.5]);
c = pca(ncounts,'Algorithm','eig','Centered',true);
i = 1;
figure('color','w');
hold on;
for k = 1 : 12
h(k) = plot(c(k,1),c(k,2),'.','color',clrmtx(i,:),'MarkerSize',20);
if mod(k,2) == 0
i = i + 1;
end
end
hl = legend(h(1:2:end),{'Input','80s','2and3','4and5','6and7','>7'});
set(hl,'Location','NorthWest','EdgeColor','w');
hold off;
%}
......@@ -3,8 +3,8 @@ clc
clear variables
close all
tmp = readCountTable('countTable_UTRPolysomes_RUVg_22Nov2017.txt');
%{
tmp = readCountTable('../tables/countTable_UTRPolysomes_RUVg_22Nov2017.txt');
x = tmp.counts(:,1:2);
y = tmp.counts(:,3:2:end);
......@@ -38,25 +38,24 @@ for k = 1 : N
end
hold off;
%{
%}
%% read stats file
stats = readCountTable('countTable_UTRPolysomes_Stats_21Nov2017.txt');
%stats = readCountTable('../tables/countTable_UTRPolysomes_Stats_21Nov2017.txt');
%% read ercc file
ercc = readCountTable('countTable_UTRPolysomes_ERCC_21Nov2017.txt');
fr=fopen('cms_095046.txt','r');
ercc = readCountTable('../tables/countTable_UTRPolysomes_ERCC_21Nov2017.txt');
fr=fopen('../tables/cms_095046.txt','r');
fgetl(fr);
txt = textscan(fr,'%*s %s %*s %n %n %*s %*s','delimiter','\t');
fclose(fr);
ercc.ref=txt{1};
ercc.mix1=txt{2};
ercc.mix2=txt{3};
ercc.volume = [16,22,24,10,7,7];
ercc.volume = [7,10,16,22,24,7];
ercc.volume = repmat(ercc.volume, 2, 1);
ercc.volume = ercc.volume(:)';
ercc.dilution = [1000,1000,1000,1000,100,100];
ercc.dilution = [100,1000,1000,1000,1000,100];
ercc.dilution = repmat(ercc.dilution, 2, 1);
ercc.dilution = ercc.dilution(:)';
ercc.weight = ercc.volume ./ ercc.dilution;
......@@ -65,31 +64,43 @@ ercc.mix = repmat(ercc.mix1,1,size(ercc.counts,2));
ercc.conc = bsxfun(@times, ercc.mix, ercc.weight);
[~,idxQry, idxRef] = intersect(ercc.label, ercc.ref);
X = ercc.conc(idxRef,:);
Y = ercc.counts(idxQry,:);
Y = ercc.conc(idxRef,:);
X = ercc.counts(idxQry,:);
idxFilter = (sum(Y >= 5, 2) >= 2);
X = X(idxFilter,:);
Y = Y(idxFilter,:);
c = pca(Y);
Ysmth = smooth(X(:),Y(:),'lowess');
Ysmth = reshape(Ysmth, size(Y));
YY = bsxfun(@rdivide, Y, sum(Y));
%% example variation, raw counts
tmp = Y+1;
tmpr = bsxfun(@rdivide, tmp, median(tmp(:)));
figure('color','w')
clrmtx = jet(12);
PP = zeros(12,2);
figure('color','w');
hold on;
plot([0,13],[0,0],'k');
boxplot(log2(tmpr),'color',[.45,.45,.45]);
for k = 1 : 12
pX = log2(X(:,k));
pY = log2(Y(:,k));
p = polyfit(pX,pY,1);
PP(k,:) = p;
plot(pX,pY,'.','color',clrmtx(k,:));
xfit = linspace(min(pX),max(pX),100);
yfit = polyval(p, xfit);
plot(xfit,yfit, 'color',clrmtx(k,:));
end
xfit = linspace(2,15,100);
yfit = polyval(median(PP),xfit);
plot(xfit, yfit, 'color','k','linewidth',1.2);
hold off;
set(gca,'XTickLabel',ercc.header(2:end),...
'XTickLabelRotation',45);
%}
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment