This page contains the protocol and the software used at the Computational Genomics Lab to perform the RGASP evaluations on the Sanger Center cluster server. To take advantage of the multiple CPUs available on that server via the queuing system, most of the processes were turned into scripts, each running a single step of the pipeline (S), on every sequence being considered (Q), for each submission (M), on each of the three species (P). That approach yields Q x M x P scripts running simultaneously (depending on CPU availability in the queue), and a minimum of S x Q x M x P processes executed in each RGASP analysis round.
Software tools on this web page are provided “AS IS”. These programs are distributed under the Free Software Foundation GNU General Public License (GNU-GPL), unless stated otherwise.
Preliminaries
The following files, in tabular format, contain information that will be used by different bash commands in the sections below:
Species | Sequence Identifiers | Sequence Info | RGASP Round 1 | RGASP Round 2 |
HUMAN | ids.hum (61 b) | seqinfo.hum (4.82 Kb) | human.ids (3.03 Kb) | human.ids (676 b) |
WORM | ids.wor (18 b) | seqinfo.wor (938 b) | worm.ids (5.02 Kb) | worm.ids (494 b) |
FLY | ids.fly (63 b) | seqinfo.fly (1.85 Kb) | fly.ids (4.03 Kb) | fly.ids (442 b) |
Code Libraries
Here we provide a number of software libraries/modules that can be called by many of the programs listed in the sections below.
File | Description | Pre-requisites |
global.pm (35.35 Kb) | A set of general purpose functions | Getopt::Long , Term::ANSIColor , Benchmark , POSIX |
largeseqs.pm (41.66 Kb) | Loading large sets of sequences in fasta or tabular format. | Inline , Inline::C , Inline::Files , Inline::Filters |
Main Programs
File | Description | Pre-requisites |
validate_cds.pl (32.24 Kb) | Checking GFF features for RGASP submissions | global , largeseqs |
GFF Features Validation
Testing whether validate_cds.pl works as expected (you can click on the test files shown in the following command line to download them):
validate_cds.pl validate_cds.fa validate_cds.gff > validate_cds.gff.cds
Validating GFF Submissions
Making Boxplots
The following files contain some vectors with settings for the plots, such as group names, color definitions, etc., as well as the R functions used in the following code blocks:
RGASPr1
############ R RGASP1
# RGASP round 1: scatter-plots and box-plots for the HUMAN evaluation tables.
#
# NOTE(review): hum.set*, gp.human, seqs.human, seqs.human.color and the
# do.* plotting functions are not defined in this snippet; presumably they
# come from the sourced "./plots_rgasp1.r" -- confirm.

WD <- "./rgasp1/summary"; # directory the evaluation tables are read from
OD <- "./rgasp1/plots";   # not used below; presumably read by the do.* functions -- confirm
source("./plots_rgasp1.r");

# Read one evaluation table located under WD.
#
# file.name: name of the table file, relative to WD.
# Returns the data frame produced by read.table(); the first line is taken
# as the header, lines starting with "#" are skipped, and NAN/NA/nan/na
# fields are read as NA.
read.eval.table <- function(file.name) {
  read.table(file.path(WD, file.name),
             header           = TRUE,
             row.names        = NULL, # col.names = NULL,
             as.is            = TRUE,
             na.strings       = c("NAN", "NA", "nan", "na"),
             dec              = ".",
             comment.char     = "#",
             strip.white      = TRUE,
             blank.lines.skip = TRUE);
}

# HUMAN SCATTER-/BOX- PLOTS
Mnuch <- read.eval.table("all_human_ALL.evalnucleotide");
Mexnh <- read.eval.table("all_human_ALL.evalexin");
Mgenh <- read.eval.table("all_human_ALL.evalgene");

do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set,      "ALL",     "[ALL]",              "human", gp.human);
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.part, "partial", "[PARTIAL PREDs]",    "human", gp.human);
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.exon, "exon",    "[nonCDS EXONS]",     "human", gp.human);
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.cds,  "cds",     "[CDS EXONS]",        "human", gp.human);
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.full, "full",    "[CDS+nonCDS EXONS]", "human", gp.human);

do.boxplots(Mnuch, Mexnh, Mgenh, hum.set,      "ALL",     "[ALL]",              "human", gp.human);
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.part, "partial", "[PARTIAL PREDs]",    "human", gp.human);
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.exon, "exon",    "[nonCDS EXONS]",     "human", gp.human);
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.cds,  "cds",     "[CDS EXONS]",        "human", gp.human);
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.full, "full",    "[CDS+nonCDS EXONS]", "human", gp.human);

do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set,      "ALL",     "[ALL]",
                 "human", gp.human, seqs.human, seqs.human.color);
# The label below was split by a line break inside the string literal in the
# page text; it is restored to match the other "[PARTIAL PREDs]" calls.
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.part, "partial", "[PARTIAL PREDs]",
                 "human", gp.human, seqs.human, seqs.human.color);
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.exon, "exon",    "[nonCDS EXONS]",
                 "human", gp.human, seqs.human, seqs.human.color);
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.cds,  "cds",     "[CDS EXONS]",
                 "human", gp.human, seqs.human, seqs.human.color);
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.full, "full",    "[CDS+nonCDS EXONS]",
                 "human", gp.human, seqs.human, seqs.human.color);

do.boxplots.subs(Mnuch, Mexnh, Mgenh, hum.set, "human", gp.human, seqs.human);

# Same for WORM and FLY, replacing "hum"/"human" accordingly...
RGASPr2
############ R RGASP2
# RGASP round 2: scatter-plots and box-plots for the HUMAN evaluation tables,
# drawn once per annotation subset listed in `predsets`.
#
# NOTE(review): human.set, gp.human, seqs.human, seqs.human.color and the
# do.* plotting functions are not defined in this snippet; presumably they
# come from the sourced "./plots_rgasp2.r" -- confirm.

WD <- "./rgasp2/summary/"; # directory the evaluation tables are read from
OD <- "./rgasp2/plots";    # not used below; presumably read by the do.* functions -- confirm
source("./plots_rgasp2.r");

# Read the table "all_human_ALL.<set>.<kind>" located under WD.
#
# set:  annotation subset tag (one of the values in `predsets`).
# kind: file extension, e.g. "evalnucleotide", "evalexin" or "evalgene".
# Returns the data frame produced by read.table(); the first line is taken
# as the header, lines starting with "#" are skipped, and NAN/NA/nan/na
# fields are read as NA.
read.eval.set <- function(set, kind) {
  # Trailing slashes are stripped from WD so the path is well-formed
  # whether or not WD ends in "/".
  read.table(file.path(sub("/+$", "", WD),
                       paste0("all_human_ALL.", set, ".", kind)),
             header           = TRUE,
             row.names        = NULL, # col.names = NULL,
             as.is            = TRUE,
             na.strings       = c("NAN", "NA", "nan", "na"),
             dec              = ".",
             comment.char     = "#",
             strip.white      = TRUE,
             blank.lines.skip = TRUE);
}

# HUMAN SCATTER-/BOX- PLOTS (by ANNOTATION SUBSET)
predsets <- c("ALL", "FIL", "HIG", "MED", "LOW");

for (i in seq_along(predsets)) {

  SET  <- predsets[i];
  SETV <- paste0("[", SET, "]"); # bracketed form of the subset tag, e.g. "[ALL]"

  Mnuc <- read.eval.set(SET, "evalnucleotide");
  Mexn <- read.eval.set(SET, "evalexin");
  Mgen <- read.eval.set(SET, "evalgene");

  # Mnuc$SEQUENCE <- paste("chr_",Mnuc$SEQUENCE,sep="")
  # Mexn$SEQUENCE <- paste("chr_",Mexn$SEQUENCE,sep="")

  do.scatterplots(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human);
  do.boxplots(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human);
  do.boxplots.seqs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                   seqs.human, seqs.human.color);
  do.boxplots.subs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                   seqs.human);

}; # for SET

# Same for WORM and FLY, replacing "hum"/"human" accordingly...
# HUMAN 4-VARS BOX-PLOTS (ANNOTATION SUBSETS MERGED)
#
# Loads the nucleotide/exon/gene evaluation tables of the five annotation
# subsets (ALL, FIL, LOW, MED, HIG) and passes them all to do.boxplots.comp5.
#
# NOTE(review): WD, human.set, gp.human and do.boxplots.comp5 are not defined
# in this snippet; WD is presumably the RGASP2 summary directory set earlier
# in the same session and the rest presumably come from "./plots_rgasp2.r"
# -- confirm.

# Read the table "all_human_ALL.<set>.<kind>" located under WD.
#
# set:  annotation subset tag ("ALL", "FIL", "LOW", "MED" or "HIG").
# kind: file extension, e.g. "evalnucleotide", "evalexin" or "evalgene".
# Returns the data frame produced by read.table(); the first line is taken
# as the header, lines starting with "#" are skipped, and NAN/NA/nan/na
# fields are read as NA.
read.eval.set <- function(set, kind) {
  # Trailing slashes are stripped from WD before joining: when WD already
  # ends in "/", joining with sep = "/" yields a doubled "//" in the path.
  read.table(file.path(sub("/+$", "", WD),
                       paste0("all_human_ALL.", set, ".", kind)),
             header           = TRUE,
             row.names        = NULL, # col.names = NULL,
             as.is            = TRUE,
             na.strings       = c("NAN", "NA", "nan", "na"),
             dec              = ".",
             comment.char     = "#",
             strip.white      = TRUE,
             blank.lines.skip = TRUE);
}

Mnuc.A <- read.eval.set("ALL", "evalnucleotide");
Mexn.A <- read.eval.set("ALL", "evalexin");
Mgen.A <- read.eval.set("ALL", "evalgene");

Mnuc.F <- read.eval.set("FIL", "evalnucleotide");
Mexn.F <- read.eval.set("FIL", "evalexin");
Mgen.F <- read.eval.set("FIL", "evalgene");

Mnuc.L <- read.eval.set("LOW", "evalnucleotide");
Mexn.L <- read.eval.set("LOW", "evalexin");
Mgen.L <- read.eval.set("LOW", "evalgene");

Mnuc.M <- read.eval.set("MED", "evalnucleotide");
Mexn.M <- read.eval.set("MED", "evalexin");
Mgen.M <- read.eval.set("MED", "evalgene");

Mnuc.H <- read.eval.set("HIG", "evalnucleotide");
Mexn.H <- read.eval.set("HIG", "evalexin");
Mgen.H <- read.eval.set("HIG", "evalgene");

# Argument order (A, F, L, M, H) is kept exactly as in the original call.
do.boxplots.comp5(Mnuc.A, Mexn.A, Mgen.A,
                  Mnuc.F, Mexn.F, Mgen.F,
                  Mnuc.L, Mexn.L, Mgen.L,
                  Mnuc.M, Mexn.M, Mgen.M,
                  Mnuc.H, Mexn.H, Mgen.H,
                  human.set, "ALL", "[ALL]", "human", gp.human);

# Same for WORM and FLY, replacing "hum"/"human" accordingly...