Preliminaries
- Code Libraries
- Main Programs
  - GFF Features Validation
Validating GFF Submissions
Making Boxplots
- RGASPr1
- RGASPr2

This page contains the protocol and the software used at the Computational Genomics Lab to perform the RGASP evaluations on the Sanger Center cluster server. To take advantage of the multiple CPUs available on that server via the queuing system, most of the processes were turned into scripts, each running a single step of the pipeline (S), on every sequence being considered (Q), for each submission (M), in each of the three species (P). That approach yields Q x M x P scripts running simultaneously (depending on CPU availability in the queue), and a minimum of S x Q x M x P processes executed in each RGASP analysis round.

Software tools on this web page are provided “AS IS”. These programs are distributed under the Free Software Foundation's GNU General Public License (GNU-GPL), unless stated otherwise.

Preliminaries

The following files, in tabular format, contain information that is used by different bash commands in the sections below:

Species	Sequence Identifiers	Sequence Info	RGASP Round 1	RGASP Round 2
HUMAN	ids.hum (61 b)	seqinfo.hum (4.82 Kb)	human.ids (3.03 Kb)	human.ids (676 b)
WORM	ids.wor (18 b)	seqinfo.wor (938 b)	worm.ids (5.02 Kb)	worm.ids (494 b)
FLY	ids.fly (63 b)	seqinfo.fly (1.85 Kb)	fly.ids (4.03 Kb)	fly.ids (442 b)

Code Libraries

Here we provide a number of software libraries/modules that can be called by many of the programs listed in the sections below.

File	Description	Pre-requisites
global.pm (35.35 Kb)	A set of general purpose functions	`Getopt::Long`, `Term::ANSIColor`, `Benchmark`, `POSIX`
largeseqs.pm (41.66 Kb)	Loading large sets of sequences in fasta or tabular format.	`Inline`, `Inline::C`, `Inline::Files`, `Inline::Filters`

Main Programs

File	Description	Pre-requisites
validate_cds.pl (32.24 Kb)	Checking GFF features for RGASP submissions	`global`, `largeseqs`

GFF Features Validation

Testing if validate_cds.pl works as expected (you can click on the test files shown on the following command-line to download them):

validate_cds.pl  validate_cds.fa  validate_cds.gff  >  validate_cds.gff.cds

Validating GFF Submissions

Making Boxplots

The following files contain some vectors with settings for the plots (group names, color definitions, etc.), as well as the R functions used in the following code blocks:

RGASPr1

R Console

############ R RGASP1

# Directory holding the evaluation summaries (WD) and the plots directory (OD).
# Both are assigned before the plotting code is sourced; OD is not used
# directly in this script -- presumably the sourced functions read it (confirm
# in plots_rgasp1.r).
WD <- "./rgasp1/summary"
OD <- "./rgasp1/plots"
source("./plots_rgasp1.r")

# HUMAN SCATTER-/BOX- PLOTS

# Read one whitespace-separated evaluation table with a header line.
# "NAN"/"NA"/"nan"/"na" fields become NA, lines starting with "#" are
# comments, and character columns are kept as-is (no factor conversion).
read.eval.table <- function(path) {
  read.table(
    path,
    header = TRUE,
    row.names = NULL,
    as.is = TRUE,
    na.strings = c("NAN", "NA", "nan", "na"),
    dec = ".",
    comment.char = "#",
    strip.white = TRUE,
    blank.lines.skip = TRUE
  )
}

# Nucleotide-, exon- and gene-level evaluation tables for human.
Mnuch <- read.eval.table(file.path(WD, "all_human_ALL.evalnucleotide"))
Mexnh <- read.eval.table(file.path(WD, "all_human_ALL.evalexin"))
Mgenh <- read.eval.table(file.path(WD, "all_human_ALL.evalgene"))

# One entry per prediction subset: the subset object (hum.set*, not defined
# in this script -- presumably provided by plots_rgasp1.r), the short tag and
# the bracketed label passed on to the plotting functions.
hum.subsets <- list(
  list(set = hum.set,      tag = "ALL",     label = "[ALL]"),
  list(set = hum.set.part, tag = "partial", label = "[PARTIAL PREDs]"),
  list(set = hum.set.exon, tag = "exon",    label = "[nonCDS EXONS]"),
  list(set = hum.set.cds,  tag = "cds",     label = "[CDS EXONS]"),
  list(set = hum.set.full, tag = "full",    label = "[CDS+nonCDS EXONS]")
)

# Scatterplots for every subset first ...
for (s in hum.subsets) {
  do.scatterplots(Mnuch, Mexnh, Mgenh, s$set, s$tag, s$label,
                  "human", gp.human)
}

# ... then the boxplots ...
for (s in hum.subsets) {
  do.boxplots(Mnuch, Mexnh, Mgenh, s$set, s$tag, s$label,
              "human", gp.human)
}

# ... then the per-sequence boxplots.
for (s in hum.subsets) {
  do.boxplots.seqs(Mnuch, Mexnh, Mgenh, s$set, s$tag, s$label,
                   "human", gp.human, seqs.human, seqs.human.color)
}

# Per-submission boxplots are only produced for the complete set.
do.boxplots.subs(Mnuch, Mexnh, Mgenh, hum.set, "human", gp.human, seqs.human)

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

RGASPr2

R Console

############ R RGASP2

# Directory holding the evaluation summaries (WD) and the plots directory (OD).
# NOTE: WD deliberately keeps its trailing "/" -- the file names below are
# built by plain concatenation onto it.
# OD is not used directly in this script -- presumably the sourced functions
# read it (confirm in plots_rgasp2.r).
WD <- "./rgasp2/summary/"
OD <- "./rgasp2/plots"

source("./plots_rgasp2.r")

# Read one whitespace-separated evaluation table with a header line.
# "NAN"/"NA"/"nan"/"na" fields become NA, lines starting with "#" are
# comments, and character columns are kept as-is (no factor conversion).
read.eval.table <- function(path) {
  read.table(
    path,
    header = TRUE,
    row.names = NULL,
    as.is = TRUE,
    na.strings = c("NAN", "NA", "nan", "na"),
    dec = ".",
    comment.char = "#",
    strip.white = TRUE,
    blank.lines.skip = TRUE
  )
}

# HUMAN SCATTER-/BOX- PLOTS (by ANNOTATION SUBSET)

predsets <- c("ALL", "FIL", "HIG", "MED", "LOW")

# seq_along() instead of 1:length(): the loop body is skipped (rather than
# run with i = 1 and i = 0) if predsets is ever empty.
for (i in seq_along(predsets)) {

    SET <- predsets[i]
    SETV <- paste0("[", SET, "]")   # bracketed label, e.g. "[ALL]"

    # Common file name stem for this subset, e.g.
    # "./rgasp2/summary/all_human_ALL.FIL"
    stem <- paste0(WD, "all_human_ALL.", SET)

    # Nucleotide-, exon- and gene-level evaluation tables for this subset.
    Mnuc <- read.eval.table(paste0(stem, ".evalnucleotide"))
    Mexn <- read.eval.table(paste0(stem, ".evalexin"))
    Mgen <- read.eval.table(paste0(stem, ".evalgene"))

    # Mnuc$SEQUENCE <- paste("chr_",Mnuc$SEQUENCE,sep="")
    # Mexn$SEQUENCE <- paste("chr_",Mexn$SEQUENCE,sep="")

    do.scatterplots(Mnuc,  Mexn, Mgen, human.set, SET, SETV, "human", gp.human)
    do.boxplots(Mnuc,      Mexn, Mgen, human.set, SET, SETV, "human", gp.human)
    do.boxplots.seqs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                     seqs.human, seqs.human.color)
    do.boxplots.subs(Mnuc, Mexn, Mgen, human.set, SET, SETV, "human", gp.human,
                     seqs.human)

} # for SET

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

R Console

# HUMAN 4-VARS BOX-PLOTS (ANNOTATION SUBSETS MERGED)

# Read one whitespace-separated evaluation table with a header line.
# "NAN"/"NA"/"nan"/"na" fields become NA, lines starting with "#" are
# comments, and character columns are kept as-is (no factor conversion).
read.eval.table <- function(path) {
  read.table(
    path,
    header = TRUE,
    row.names = NULL,
    as.is = TRUE,
    na.strings = c("NAN", "NA", "nan", "na"),
    dec = ".",
    comment.char = "#",
    strip.white = TRUE,
    blank.lines.skip = TRUE
  )
}

# Load the human evaluation table for one annotation subset ("ALL", "FIL",
# "LOW", "MED", "HIG") and one evaluation level ("nucleotide", "exin",
# "gene"), i.e. the file <WD>/all_human_ALL.<subset>.eval<level>.
# WD is defined with a trailing "/" in the RGASP2 setup above, so any
# trailing slashes are stripped before joining; this avoids the "//" that
# paste(WD, ..., sep = "/") produced and works whether or not WD ends in "/".
read.human.eval <- function(subset, level) {
  fname <- paste0("all_human_ALL.", subset, ".eval", level)
  read.eval.table(file.path(sub("/+$", "", WD), fname))
}

Mnuc.A <- read.human.eval("ALL", "nucleotide")
Mexn.A <- read.human.eval("ALL", "exin")
Mgen.A <- read.human.eval("ALL", "gene")

Mnuc.F <- read.human.eval("FIL", "nucleotide")
Mexn.F <- read.human.eval("FIL", "exin")
Mgen.F <- read.human.eval("FIL", "gene")

Mnuc.L <- read.human.eval("LOW", "nucleotide")
Mexn.L <- read.human.eval("LOW", "exin")
Mgen.L <- read.human.eval("LOW", "gene")

Mnuc.M <- read.human.eval("MED", "nucleotide")
Mexn.M <- read.human.eval("MED", "exin")
Mgen.M <- read.human.eval("MED", "gene")

Mnuc.H <- read.human.eval("HIG", "nucleotide")
Mexn.H <- read.human.eval("HIG", "exin")
Mgen.H <- read.human.eval("HIG", "gene")

# Subsets are passed in the order ALL, FIL, LOW, MED, HIG.
# NOTE(review): this differs from the predsets order used in the per-subset
# loop (ALL, FIL, HIG, MED, LOW) -- confirm against the parameter order of
# do.boxplots.comp5 in plots_rgasp2.r.
do.boxplots.comp5(Mnuc.A, Mexn.A, Mgen.A,
                  Mnuc.F, Mexn.F, Mgen.F,
                  Mnuc.L, Mexn.L, Mgen.L,
                  Mnuc.M, Mexn.M, Mgen.M,
                  Mnuc.H, Mexn.H, Mgen.H,
                  human.set, "ALL", "[ALL]", "human", gp.human)

# Same for WORM and FLY, replacing "hum"/"human" accordingly...

Back to RGASP Summary page

Table of contents

Preliminaries

Code Libraries

Main Programs

GFF Features Validation

Validating GFF Submissions

Making Boxplots

RGASPr1

RGASPr2