Commit 947eaf12 authored by Georgi Tushev's avatar Georgi Tushev
Browse files

experiment summary

parent 768973df
# UTRPolyribosomes # UTRPolyribosomes
3'UTR-End Sequencing on polyribosome fractions 3'UTR-End Sequencing on polyribosome fractions
## Layout
samples (x 2 replica):
* input (bound and unbound fraction together)
* 80S (monosome fraction)
* 2and3
* 4and5
* 6and7
* \>7
\+ spike-in controls (ERCC)
The library preparation is done using [MACE](https://www.ncbi.nlm.nih.gov/pubmed/25052703) method. Reads are sequenced on Illumina NextSeq 550.
## Overview
#!/usr/bin/perl
use Getopt::Long;
use warnings;
use strict;
sub usage($);
sub fileList($$);
sub checkPaths($$$$);
sub parseFileList($$$);
sub parseLogs($$);
sub reportSummary($$);
#sub parseLogFile($);
#sub parseCounts($$);
MAIN:
{
my $help;
my $pathLogs;
my $pathCoverage;
my $keyDelimiter = '_';
my $reportErcc;
my %summary = ();
# input parser
GetOptions(
"logs=s" => \$pathLogs,
"gbed=s" => \$pathCoverage,
"delim=s" => \$keyDelimiter,
"ercc" => \$reportErcc,
"help" => \$help
) or usage("Error :: invalid command line options");
#defaults
usage("version 0.1, Nov 2017") if($help);
usage("Error :: path to log files is required!") if(!defined($pathLogs));
usage("Error :: path to gbed files is required!") if(!defined($pathCoverage));
# check paths
checkPaths($pathLogs, $pathCoverage, $keyDelimiter, \%summary);
# parse logs
parseFileList(\%summary, "log", \&parseLogs);
parseFileList(\%summary, "gbed", \&parseCoverage);
# report
reportSummary(\%summary, $reportErcc);
exit 0;
}
### --- subroutines --- ###
## CHECKPATHS
sub checkPaths($$$$)
{
my $pathLogs = $_[0];
my $pathCoverage = $_[1];
my $keyDelimiter = $_[2];
my $summary = $_[3];
my $listLogs = fileList($pathLogs, "txt");
my $listCoverage = fileList($pathCoverage, "gbed.gz");
# check logs and gbed
my $iter = 0;
foreach my $fileLog (@{$listLogs})
{
# current files
my $fileBam = $listCoverage->[$iter];
# check if logs and gbed match
my $checkKey = $fileBam;
$checkKey =~ s/\.gbed.gz//g;
usage("Error :: Log and Bam files mismatch!") if (index($fileLog, $checkKey) == -1);
# check summary key
my $summaryKey = ($checkKey =~ m/(.*)$keyDelimiter.*/) ? $1 : "unknown";
usage("Error :: Summary key cannot be parsed!") if($summaryKey eq "unknown");
# set summary
$summary->{$checkKey}{"log"} = $pathLogs . '/' . $fileLog;
$summary->{$checkKey}{"gbed"} = $pathCoverage . '/' . $fileBam;
# next set files
$iter++;
}
return;
}
## FILELIST
sub fileList($$)
{
my $path = $_[0];
my $extension = $_[1];
my @list = ();
opendir(my $ph, $path) or die $!;
while(my $file = readdir($ph))
{
# filter for file
next unless (-f "$path/$file");
# filter for extension
next unless ($file =~ m/\.$extension$/);
# accumulate list
push(@list, $file);
}
closedir($ph);
return \@list;
}
## PARSEFILELIST
sub parseFileList($$$)
{
my $summary = $_[0];
my $secondaryKey = $_[1];
my $fcnCallback = $_[2];
foreach my $primaryKey (keys %{$summary})
{
my $queryFile = $summary->{$primaryKey}{$secondaryKey};
$fcnCallback->($queryFile, $summary->{$primaryKey});
}
return;
}
## PARSELOGS
sub parseLogs($$)
{
my $logFile = $_[0];
my $summary = $_[1];
my $countsRaw = 0;
my $countsUniquelyAligned = 0;
# parse each file
open(my $fh, "<", $logFile) or die $!;
while(<$fh>)
{
chomp($_);
# parse total number of reads
if($_ =~ m/Number of input reads \|\s+([0-9]+)/g)
{
$countsRaw = $1;
}
# parse uniquely aligned reads
if($_ =~ m/Uniquely mapped reads number \|\s+([0-9]+)/g)
{
$countsUniquelyAligned = $1;
}
}
close($fh);
# check results
usage("Error :: could not parse log files!") if ($countsRaw == 0 || $countsUniquelyAligned == 0);
# fill summary
$summary->{"reads.raw"} = $countsRaw;
$summary->{"reads.aligned"} = $countsUniquelyAligned;
return;
}
## PARSECOVERAGE
sub parseCoverage($$)
{
my $gbedFile = $_[0];
my $summary = $_[1];
# fix gbed file spaces
$gbedFile =~ s/\ /\\ /g;
# open stream
my $erccCount = 0;
my $mitoCount = 0;
open(my $fh, "gzcat $gbedFile|") or die $!;
while(<$fh>)
{
chomp($_);
# count ERCC
if($_ =~ m/^ERCC/)
{
my @line = split("\t", $_, 6);
$erccCount += $line[3];
$summary->{"list.ercc"}{$line[0]} += $line[3];
}
# count Mitochondria
if($_ =~ m/^chrM/)
{
my @line = split("\t", $_, 6);
$mitoCount += $line[3];
}
}
close($fh);
$summary->{"reads.ercc"} = $erccCount;
$summary->{"reads.mito"} = $mitoCount;
}
## REPORTSUMMARY
sub reportSummary($$)
{
my $summary = $_[0];
my $reportErcc = $_[1];
if ($reportErcc)
{
# extract Ercc list
my %listErcc = ();
foreach my $key(sort keys %{$summary})
{
foreach my $ercc (sort keys %{$summary->{$key}{"list.ercc"}})
{
$listErcc{$ercc} = 1;
}
}
# print table
my @keysList = sort keys %{$summary};
print "ERCC\t",join("\t", @keysList),"\n";
foreach my $ercc (sort keys %listErcc)
{
print $ercc;
foreach my $key (@keysList)
{
my $erccCount = exists($summary->{$key}{"list.ercc"}{$ercc}) ? $summary->{$key}{"list.ercc"}{$ercc} : 0;
print "\t",$erccCount;
}
print "\n";
}
}
else
{
# header
print "sample\treads.raw\treads.aligned\treads.ercc\treads.mito\tratio.aligned\tratio.ercc\tratio.mito\n";
foreach my $key (sort keys %{$summary})
{
# counts
my $countsRaw = exists($summary->{$key}{"reads.raw"}) ? $summary->{$key}{"reads.raw"} : 0;
my $countsAligned = exists($summary->{$key}{"reads.aligned"}) ? $summary->{$key}{"reads.aligned"} : 0;
my $countsErcc = exists($summary->{$key}{"reads.ercc"}) ? $summary->{$key}{"reads.ercc"} : 0;
my $countsMito = exists($summary->{$key}{"reads.mito"}) ? $summary->{$key}{"reads.mito"} : 0;
# percentage
my $percentAligned = (100 * $countsAligned)/$countsRaw;
my $percentErcc = (100 * $countsErcc)/$countsRaw;
my $percentMito = (100 * $countsMito)/$countsRaw;
# output
print $key,"\t",$countsRaw,"\t",$countsAligned,"\t",$countsErcc,"\t",$countsMito,"\t",sprintf("%.6f",$percentAligned),"\t",sprintf("%.6f",$percentErcc),"\t",sprintf("%.6f",$percentMito),"\n";
}
}
}
## USAGE
sub usage($)
{
my $message = $_[0];
if (defined $message && length $message)
{
$message .= "\n" unless($message =~ /\n$/);
}
my $command = $0;
$command =~ s#^.*/##;
print STDERR (
$message,
"usage: $command -logs /path/to/STAR/logFiles/ -gbed /path/to/STAR/gbedFiles/\n" .
"description: creates a table for summarising reads\n" .
"parameters:\n" .
"--logs\n" .
"\t/path/to/STAR/logFiles/, log files are created during STAR alignment\n" .
"--gbed\n" .
"\t/path/to/STAR/gbedFiles/, coverage files\n" .
"--delim\n" .
"\tdelimiter to extract sample key\n" .
"-ercc\n" .
"\texports ercc table instead of experiment report\n" .
"-help\n" .
"\tdefine usage\n"
);
die("\n");
}
### ------------------- ###
=head
sub parseCounts($$)
{
my $file_counts = $_[0];
my $summary = $_[1];
my @keyPrimary = ();
open(my $fh, "<", $file_counts) or die $!;
while(<$fh>)
{
chomp($_);
# skip commands
next if($_ =~ m/^#/);
# split header
if($_ =~ m/^Geneid/)
{
my @header = split("\t", $_);
@header = @header[6..$#header];
# parse header
foreach my $label (@header)
{
my @tmp = split("\/",$label);
@tmp = split("\_", $tmp[-1]);
my $key = $tmp[0];
push(@keyPrimary, $key);
}
}
else
{
my @line = split("\t", $_);
@line = @line[6..$#line];
my $keySecondary = ($_ =~ m/^ERCC\-/) ? "ercc" : "gene";
for(my $i = 0; $i <= $#line; $i++)
{
$summary->{$keyPrimary[$i]}{$keySecondary} += $line[$i];
}
}
}
close($fh);
}
=cut
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment