Back to RGASP Summary page

 

This page contains the protocol and the software used at the Computational Genomics Lab to perform the RGASP evaluations on the Sanger Center (external link) cluster server. To take advantage of the multiple CPUs available on that server via the queuing system, most of the processes were turned into scripts, each running a single step of the pipeline (S) on every sequence being considered (Q), for each submission (M), on each of the three species (P). That approach yields Q x M x P scripts running simultaneously (depending on CPU availability in the queue), and a minimum of S x Q x M x P processes executed in each RGASP analysis round.

Software tools on this web page are provided "AS IS". These programs are distributed under the Free Software Foundation (external link) GNU General Public License (GNU GPL), unless stated otherwise.



Preliminaries

 
The following files in tabular format contain information that will be used by different bash commands in the sections below:

Species | Sequence Identifiers | Sequence Info | RGASP Round 1 | RGASP Round 2
HUMAN ids.hum (61 b) seqinfo.hum (4.82 Kb) human.ids (3.03 Kb) human.ids (676 b)
WORM ids.wor (18 b) seqinfo.wor (938 b) worm.ids (5.02 Kb) worm.ids (494 b)
FLY ids.fly (63 b) seqinfo.fly (1.85 Kb) fly.ids (4.03 Kb) fly.ids (442 b)

 

Code Libraries

 
Here we provide a number of software libraries/modules that can be called by many of the programs listed in the sections below.

File Description Pre-requisites
global.pm (35.35 Kb) A set of general purpose functions Getopt::Long, Term::ANSIColor, Benchmark, POSIX
largeseqs.pm (41.66 Kb) Loading large sets of sequences in FASTA or tabular format. Inline, Inline::C, Inline::Files, Inline::Filters

 

Main Programs

 

File Description Pre-requisites
validate_cds.pl (32.24 Kb) Checking GFF features for RGASP submissions global, largeseqs

 

GFF Features Validation

 
Testing if validate_cds.pl works as expected (you can click on the test files shown on the following command-line to download them):

validate_cds.pl  validate_cds.fa  validate_cds.gff  >  validate_cds.gff.cds

 

Validating GFF Submissions

 

Making Boxplots

 
The following files contain some vectors with settings for the plots, like group names, color definitions, etc., as well as the R functions used in the following code blocks:

 

RGASPr1

 

R Console
############ R RGASP1

# WD is the directory the evaluation summaries are read from below.
# OD is not used in this block; presumably the sourced plotting code
# reads it (verify against plots_rgasp1.r). Both are assigned before
# source() is called.
WD <- "./rgasp1/summary"
OD <- "./rgasp1/plots"
source("./plots_rgasp1.r")

# HUMAN SCATTER-/BOX- PLOTS

# Read one evaluation summary file, given by name, from the WD directory.
# The file is expected to carry a header line; rows are numbered rather
# than named, strings are kept as character, and any spelling of
# NAN/NA is read as a missing value.
load.human.eval <- function(file.name) {
  read.table(file.path(WD, file.name),
             header = TRUE, row.names = NULL,
             as.is = TRUE, na.strings = c("NAN", "NA", "nan", "na"),
             dec = ".", comment.char = "#",
             strip.white = TRUE, blank.lines.skip = TRUE)
}

Mnuch <- load.human.eval("all_human_ALL.evalnucleotide")
Mexnh <- load.human.eval("all_human_ALL.evalexin")
Mgenh <- load.human.eval("all_human_ALL.evalgene")

# NOTE(review): hum.set*, gp.human, seqs.human and seqs.human.color are
# not defined in this block; presumably they come from plots_rgasp1.r.

# Scatter plots, one call per annotation subset.
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set,
                "ALL", "[ALL]", "human", gp.human)
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.part,
                "partial", "[PARTIAL PREDs]", "human", gp.human)
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.exon,
                "exon", "[nonCDS EXONS]", "human", gp.human)
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.cds,
                "cds", "[CDS EXONS]", "human", gp.human)
do.scatterplots(Mnuch, Mexnh, Mgenh, hum.set.full,
                "full", "[CDS+nonCDS EXONS]", "human", gp.human)

# Box plots, same subsets in the same order.
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set,
            "ALL", "[ALL]", "human", gp.human)
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.part,
            "partial", "[PARTIAL PREDs]", "human", gp.human)
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.exon,
            "exon", "[nonCDS EXONS]", "human", gp.human)
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.cds,
            "cds", "[CDS EXONS]", "human", gp.human)
do.boxplots(Mnuch, Mexnh, Mgenh, hum.set.full,
            "full", "[CDS+nonCDS EXONS]", "human", gp.human)

# Box plots that additionally receive the sequence list and its colors.
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set,
                 "ALL", "[ALL]", "human", gp.human,
                 seqs.human, seqs.human.color)
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.part,
                 "partial", "[PARTIAL PREDs]", "human", gp.human,
                 seqs.human, seqs.human.color)
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.exon,
                 "exon", "[nonCDS EXONS]", "human", gp.human,
                 seqs.human, seqs.human.color)
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.cds,
                 "cds", "[CDS EXONS]", "human", gp.human,
                 seqs.human, seqs.human.color)
do.boxplots.seqs(Mnuch, Mexnh, Mgenh, hum.set.full,
                 "full", "[CDS+nonCDS EXONS]", "human", gp.human,
                 seqs.human, seqs.human.color)

# Single call over the complete human set.
do.boxplots.subs(Mnuch, Mexnh, Mgenh, hum.set, "human", gp.human, seqs.human)

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

 

RGASPr2

 

R Console
############ R RGASP2

# WD ends with "/" here, so file names are appended to it directly
# (sep = "") when the input paths are built below.
# OD is not used in this block; presumably the sourced plotting code
# reads it (verify against plots_rgasp2.r).
WD <- "./rgasp2/summary/"
OD <- "./rgasp2/plots"

source("./plots_rgasp2.r")

# HUMAN SCATTER-/BOX- PLOTS (by ANNOTATION SUBSET)

predsets <- c("ALL", "FIL", "HIG", "MED", "LOW")

# Read the human evaluation table for one annotation subset.
#   set   - annotation subset tag, one of the values in predsets
#   level - evaluation level suffix: "nucleotide", "exin" or "gene"
# Returns the data frame produced by read.table(); the file is expected
# to carry a header line, rows are numbered rather than named, strings
# are kept as character, and any spelling of NAN/NA is read as missing.
read.human.eval.set <- function(set, level) {
  read.table(paste(WD, "all_human_ALL.", set, ".eval", level, sep = ""),
             header = TRUE, row.names = NULL,
             as.is = TRUE, na.strings = c("NAN", "NA", "nan", "na"),
             dec = ".", comment.char = "#",
             strip.white = TRUE, blank.lines.skip = TRUE)
}

# seq_along() gives an empty sequence for an empty vector, whereas
# 1:length(predsets) would iterate over c(1, 0) in that case.
for (i in seq_along(predsets)) {

    SET <- predsets[i]
    SETV <- paste("[", SET, "]", sep = "")

    Mnuc <- read.human.eval.set(SET, "nucleotide")
    Mexn <- read.human.eval.set(SET, "exin")
    Mgen <- read.human.eval.set(SET, "gene")

    # Mnuc$SEQUENCE <- paste("chr_",Mnuc$SEQUENCE,sep="")
    # Mexn$SEQUENCE <- paste("chr_",Mexn$SEQUENCE,sep="")

    # NOTE(review): human.set, gp.human, seqs.human and seqs.human.color
    # are not defined in this block; presumably they come from
    # plots_rgasp2.r.
    do.scatterplots(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human)
    do.boxplots(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human)
    do.boxplots.seqs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                     seqs.human, seqs.human.color)
    do.boxplots.subs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                     seqs.human)

} # for SET

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

 

R Console
# HUMAN 4-VARS BOX-PLOTS (ANNOTATION SUBSETS MERGED)

# Read the human evaluation table for one annotation subset from WD.
#   set   - annotation subset tag: "ALL", "FIL", "LOW", "MED" or "HIG"
#   level - evaluation level suffix: "nucleotide", "exin" or "gene"
# Trailing slashes are stripped from WD before the path is joined, so the
# result holds a single "/" whether or not WD ends with one (the RGASP2
# setup assigns "./rgasp2/summary/", which previously gave "summary//...").
# Returns the data frame produced by read.table(); the file is expected
# to carry a header line, rows are numbered rather than named, strings
# are kept as character, and any spelling of NAN/NA is read as missing.
read.human.eval.subset <- function(set, level) {
  read.table(file.path(sub("/+$", "", WD),
                       paste("all_human_ALL.", set, ".eval", level, sep = "")),
             header = TRUE, row.names = NULL,
             as.is = TRUE, na.strings = c("NAN", "NA", "nan", "na"),
             dec = ".", comment.char = "#",
             strip.white = TRUE, blank.lines.skip = TRUE)
}

Mnuc.A <- read.human.eval.subset("ALL", "nucleotide")
Mexn.A <- read.human.eval.subset("ALL", "exin")
Mgen.A <- read.human.eval.subset("ALL", "gene")
Mnuc.F <- read.human.eval.subset("FIL", "nucleotide")
Mexn.F <- read.human.eval.subset("FIL", "exin")
Mgen.F <- read.human.eval.subset("FIL", "gene")
Mnuc.L <- read.human.eval.subset("LOW", "nucleotide")
Mexn.L <- read.human.eval.subset("LOW", "exin")
Mgen.L <- read.human.eval.subset("LOW", "gene")
Mnuc.M <- read.human.eval.subset("MED", "nucleotide")
Mexn.M <- read.human.eval.subset("MED", "exin")
Mgen.M <- read.human.eval.subset("MED", "gene")
Mnuc.H <- read.human.eval.subset("HIG", "nucleotide")
Mexn.H <- read.human.eval.subset("HIG", "exin")
Mgen.H <- read.human.eval.subset("HIG", "gene")

# The tables are passed in the order ALL, FIL, LOW, MED, HIG.
# NOTE(review): human.set and gp.human are not defined in this block;
# presumably they come from plots_rgasp2.r.
do.boxplots.comp5(Mnuc.A, Mexn.A, Mgen.A,
                  Mnuc.F, Mexn.F, Mgen.F,
                  Mnuc.L, Mexn.L, Mgen.L,
                  Mnuc.M, Mexn.M, Mgen.M,
                  Mnuc.H, Mexn.H, Mgen.H,
                  human.set, "ALL", "[ALL]", "human", gp.human)

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

 

Back to RGASP Summary page



The original document is available at https://compgen.bio.ub.edu/CompGenOld/RGASP+Software