diff options
-rw-r--r-- | academic/spidey/README | 22 | ||||
-rw-r--r-- | academic/spidey/slack-desc | 6 | ||||
-rw-r--r-- | academic/spidey/spidey.1 | 279 | ||||
-rw-r--r-- | academic/spidey/spidey.SlackBuild | 61 | ||||
-rw-r--r-- | academic/spidey/spidey.info | 6 |
5 files changed, 349 insertions, 25 deletions
diff --git a/academic/spidey/README b/academic/spidey/README index 0912181c25..6e7c198210 100644 --- a/academic/spidey/README +++ b/academic/spidey/README @@ -1,9 +1,15 @@ -Spidey is an mRNA-to-genomic alignment program. For a complete -description of how Spidey works, visit -http://www.ncbi.nlm.nih.gov/spidey/spideydoc.html. +Spidey: an mRNA-to-genomic alignment program. -This is just repackaging of the ready binary for x86 and will not run -on x86_64. It will probably work just fine on a Slackware multilib -box, but we do not support that ;). If you want to build spidey from -source, you should download and compile the NCBI toolkit. For more -information: http://www.ncbi.nlm.nih.gov/spidey/spideysource.html +Spidey is a tool for aligning one or more mRNA sequences to a given +genomic sequence. It was written with two main goals in mind: +1) find good alignments regardless of intron size +2) avoid getting confused by nearby pseudogenes and paralogs. + +The following programs provide a GUI to run spidey: +-ugene +-perlprimer + +This is just repackaging of precompiled binaries: +- x86 platform: the executable is provided by upstream (NCBI). +- x86_64 platform: the executable is kindly provided by the UniPro +Ugene project, where it is part of their External Tools meta-package. diff --git a/academic/spidey/slack-desc b/academic/spidey/slack-desc index cd570e4133..3689502f74 100644 --- a/academic/spidey/slack-desc +++ b/academic/spidey/slack-desc @@ -8,12 +8,12 @@ |-----handy-ruler------------------------------------------------------| spidey: spidey (mRNA-to-genomic alignment) spidey: -spidey: Spidey is an mRNA-to-genomic alignment program. -spidey: +spidey: Spidey is a tool for aligning one or more mRNA sequences +spidey: to a given genomic sequence. 
spidey: +spidey: Home: http://www.ncbi.nlm.nih.gov/spidey/index.html spidey: spidey: spidey: spidey: spidey: -spidey: Home: http://www.ncbi.nlm.nih.gov/spidey/index.html diff --git a/academic/spidey/spidey.1 b/academic/spidey/spidey.1 new file mode 100644 index 0000000000..f3e2836f95 --- /dev/null +++ b/academic/spidey/spidey.1 @@ -0,0 +1,279 @@ +.TH SPIDEY 1 2005-01-25 NCBI "NCBI Tools User's Manual" +.SH NAME +spidey \- align mRNA sequences to a genome +.SH SYNOPSIS +.B spidey +[\|\fB\-\fP\|] +[\|\fB\-F\fP\ \fIN\fP\|] +[\|\fB\-G\fP\|] +[\|\fB\-L\fP\ \fIN\fP\|] +[\|\fB\-M\fP\ \fIfilename\fP\|] +[\|\fB\-N\fP\ \fIfilename\fP\|] +[\|\fB\-R\fP\ \fIfilename\fP\|] +[\|\fB\-S\fP\ \fIp/m\fP\|] +[\|\fB\-T\fP\ \fIN\fP\|] +[\|\fB\-X\fP\|] +[\|\fB\-a\fP\ \fIfilename\fP\|] +[\|\fB\-c\fP\ \fIN\fP\|] +[\|\fB\-d\fP\|] +[\|\fB\-e\fP\ \fIX\fP\|] +[\|\fB\-f\fP\ \fIX\fP\|] +[\|\fB\-g\fP\ \fIX\fP\|] +\fB\-i\fP\ \fIfilename\fP +[\|\fB\-j\fP\|] +[\|\fB\-k\fP\ \fIfilename\fP\|] +[\|\fB\-l\fP\ \fIN\fP\|] +\fB\-m\fP\ \fIfilename\fP +[\|\fB\-n\fP\ \fIN\fP\|] +[\|\fB\-o\fP\ \fIstr\fP\|] +[\|\fB\-p\fP\ \fIN\fP\|] +[\|\fB\-r\fP\ \fIc/d/m/p/v\fP\|] +[\|\fB\-s\fP\|] +[\|\fB\-t\fP\ \fIfilename\fP\|] +[\|\fB\-u\fP\|] +[\|\fB\-w\fP\|] +.SH DESCRIPTION +\fBspidey\fP is a tool for aligning one or more mRNA sequences to a +given genomic sequence. \fBspidey\fP was written with two main goals +in mind: find good alignments regardless of intron size; and avoid +getting confused by nearby pseudogenes and paralogs. Towards the +first goal, \fBspidey\fP uses BLAST and Dot View (another local +alignment tool) to find its alignments; since these are both local +alignment tools, \fBspidey\fP does not intrinsically favor shorter or +longer introns and has no maximum intron size. To avoid mistakenly +including exons from paralogs and pseudogenes, \fBspidey\fP first +defines windows on the genomic sequence and then performs the +mRNA-to-genomic alignment separately within each window. 
Because of +the way the windows are constructed, neighboring paralogs or +pseudogenes should be in separate windows and should not be included +in the final spliced alignment. +.SS Initial alignments and construction of genomic windows +\fBspidey\fP takes as input a single genomic sequence and a set of +mRNA accessions or FASTA sequences. All processing is done one mRNA +sequence at a time. The first step for each mRNA sequence is a +high-stringency BLAST against the genomic sequence. The resulting +hits are analyzed to find the genomic windows. +.PP +The BLAST alignments are sorted by score and then assigned into +windows by a recursive function which takes the first alignment and +then goes down the alignment list to find all alignments that are +consistent with the first (same strand of mRNA, both the mRNA and +genomic coordinates are nonoverlapping and linearly consistent). On +subsequent passes, the remaining alignments are examined and are put +into their own nonoverlapping, consistent windows, until no alignments +are left. Depending on how many gene models are desired, the +top \fIn\fP windows are chosen to go on to the next step and the others +are deleted. +.SS Aligning in each window +Once the genomic windows are constructed, the initial BLAST alignments +are freed and another BLAST search is performed, this time with the +entire mRNA against the genomic region defined by the window, and at a +lower stringency than the initial search. \fBspidey\fP then uses a +greedy algorithm to generate a high-scoring, nonoverlapping subset of +the alignments from the second BLAST search. This consistent set is +analyzed carefully to make sure that the entire mRNA sequence is +covered by the alignments. When gaps are found between the +alignments, the appropriate region of genomic sequence is searched +against the missing mRNA, first using a very low-stringency BLAST and, +if the BLAST fails to find a hit, using DotView functions to locate +the alignment. 
When gaps are found at the ends of the alignments, the +BLAST and DotView searches are actually allowed to extend past the +boundaries of the window. If the 3' end of the mRNA does not align +completely, it is first examined for the presence of a poly(A) tail. +No attempt is made to align the portion of the mRNA that seems to be a +poly(A) tail; sometimes there is a poly(A) tail that does align to the +genomic sequence, and these are noted because they indicate the +possibility of a pseudogene. +.PP +Now that the mRNA is completely covered by the set of alignments, the +boundaries of the alignments (there should be one alignment per exon +now) are adjusted so that the alignments abut each other precisely and +so that they are adjacent to good splice donor and acceptor sites. +Most commonly, two adjacent exons' alignments overlap by as much as 20 +or 30 base pairs on the mRNA sequence. The true exon boundary may lie +anywhere within this overlap, or (as we have seen empirically) even a +few base pairs outside the overlap. To position the exon boundaries, +the overlap plus a few base pairs on each side is examined for splice +donor sites, using functions that have different splice matrices +depending on the organism chosen. The top few splice donor sites (by +score) are then evaluated as to how much they affect the original +alignment boundaries. The site that affects the boundaries the least +is chosen, and is evaluated as to the presence of an acceptor site. +The alignments are truncated or extended as necessary so that they +terminate at the splice donor site and so that they do not overlap. 
+.SS Final result +The windows are examined carefully to get the percent identity per +exon, the number of gaps per exon, the overall percent identity, the +percent coverage of the mRNA, presence of an aligning or non-aligning +poly(A) tail, number of splice donor sites and the presence or absence +of splice donor and acceptor sites for each exon, and the occurrence +of an mRNA that has a 5' or 3' end (or both) that does not align to +the genomic sequence. If the overall percent identity and percent +length coverage are above the user-defined cutoffs, a summary report +is printed, and, if requested, a text alignment showing identities and +mismatches is also printed. +.SS Interspecies alignments +\fBspidey\fP is capable of performing interspecies alignments. The +major difference in interspecies alignments is that the mRNA-genomic +identity will not be close to 100% as it is in intraspecies +alignments; also, the alignments have numerous and lengthy gaps. If +\fBspidey\fP is used in its normal mode to do interspecies alignments, +it produces gene models with many, many short exons. When the +interspecies flag is set, \fBspidey\fP uses different BLAST parameters +to encourage longer and more gaps and to not penalize as heavily for +mismatches. This way, the alignments for the exons are much longer +and more closely approximate the actual gene structure. +.SS Extracting CDS alignments +When \fBspidey\fP is run in network-aware mode or when ASN.1 files are +used for the mRNA records, it is capable of extracting a CDS alignment +from an mRNA alignment and printing the CDS information also. Since +the CDS alignment is just a subset of the mRNA alignment, it is +relatively straightforward to truncate the exon alignments as +necessary and to generate a CDS alignment. Furthermore, the +untranslated regions are now defined, so the percent identity for the +5' and 3' untranslated regions is also calculated. +.PP +.SH OPTIONS +A summary of options is included below. 
+.TP +\fB\-\fP +Print usage message. +.TP +\fB\-F\fP\ \fIN\fP +Start of genomic interval desired (from; 0-based). +.TP +\fB\-G\fP +Input file is a GI list. +.TP +\fB\-L\fP\ \fIN\fP +The extra-large intron size to use (default = 220000). +.TP +\fB\-M\fP\ \fIfilename\fP +File with donor splice matrix. +.TP +\fB\-N\fP\ \fIfilename\fP +File with acceptor splice matrix. +.TP +\fB\-R\fP\ \fIfilename\fP +File (including path) to repeat blast database for filtering. +.TP +\fB\-S\fP\ \fIp/m\fP +Restrict to plus (p) or minus (m) strand of genomic sequence. +.TP +\fB\-T\fP\ \fIN\fP +Stop of genomic interval desired (to; 0-based). +.TP +\fB\-X\fP +Use extra-large intron sizes (increases the limit for initial and +terminal introns from 100kb to 240kb and for all others from 35kb to +120kb); may result in significantly longer compute times. +.TP +\fB\-a\fP\ \fIfilename\fP +Output file for alignments when directed to a separate file with +\fB-p\ 3\fP (default = spidey.aln). +.TP +\fB\-c\fP\ \fIN\fP +Identity cutoff, in percent, for quality control purposes. +.TP +\fB\-d\fP +Also try to align coding sequences corresponding to the given mRNA +records (may require network access). +.TP +\fB\-e\fP\ \fIX\fP +First-pass e-value (default = 1.0e-10). Higher values increase speed +at the cost of sensitivity. +.TP +\fB\-f\fP\ \fIX\fP +Second-pass e-value (default = 0.001). +.TP +\fB\-g\fP\ \fIX\fP +Third-pass e-value (default = 10). +.TP +\fB\-i\fP\ \fIfilename\fP +Input file containing the genomic sequence in ASN.1 or FASTA format. +If your computer is running on a network that can access GenBank, you +can substitute the desired accession number for the filename. +.TP +\fB\-j\fP +Print ASN.1 alignment? +.TP +\fB\-k\fP\ \fIfilename\fP +File for ASN.1 output with \fB-j\fP (default = spidey.asn). +.TP +\fB\-l\fP\ \fIN\fP +Length coverage cutoff, in percent.
+.TP +\fB\-m\fP\ \fIfilename\fP +Input file containing the mRNA sequence(s) in ASN.1 or FASTA format, +or a list of their accessions (with \fB-G\fP). If your computer is +running on a network that can access GenBank, you can substitute a +single accession number for the filename. +.TP +\fB\-n\fP\ \fIN\fP +Number of gene models to return per input mRNA (default = 1). +.TP +\fB\-o\fP\ \fIstr\fP +Main output file (default = stdout; contents controlled by \fB-p\fP). +.TP +\fB\-p\fP\ \fIN\fP +Print alignment? +.RS +.PD 0 +.IP \fB0\fP +summary and alignments together (default) +.IP \fB1\fP +just the summary +.IP \fB2\fP +just the alignments +.IP \fB3\fP +summary and alignments in different files +.PD +.RE +.TP +\fB\-r\fP\ \fIc/d/m/p/v\fP +Organism of genomic sequence, used to determine splice matrices. +.RS +.PD 0 +.IP \fBc\fP +C. elegans +.IP \fBd\fP +Drosophila +.IP \fBm\fP +Dictyostelium discoideum +.IP \fBp\fP +plant +.IP \fBv\fP +vertebrate (default) +.PD +.RE +.TP +\fB\-s\fP +Tune for interspecies alignments. +.TP +\fB\-t\fP\ \fIfilename\fP +File with feature table, in 4 tab-delimited columns: +.RS +.PD 0 +.IP \fIseqid\fP +(e.g., \fBNM_04377.1\fP) +.IP \fIname\fP +(only \fBrepetitive_region\fP is currently supported) +.IP \fIstart\fP +(0-based) +.IP \fIstop\fP +(0-based) +.PD +.RE +.TP +\fB\-u\fP +Make a multiple alignment of all input mRNAs (which must overlap on +the genomic sequence). +.TP +\fB\-w\fP +Consider lowercase characters in input FASTA sequences to be masked. +.SH AUTHOR +Sarah Wheelan and others at the National Center for Biotechnology +Information; Steffen Moeller contributed to this documentation. 
+.SH SEE ALSO +.BR blast (1), +<http://www.ncbi.nlm.nih.gov/spidey> diff --git a/academic/spidey/spidey.SlackBuild b/academic/spidey/spidey.SlackBuild index 443940eb11..5a31c4f26a 100644 --- a/academic/spidey/spidey.SlackBuild +++ b/academic/spidey/spidey.SlackBuild @@ -1,25 +1,57 @@ #!/bin/sh -# Slackware build script for spidey -# Written by Petar Petrov, <ppetrov@paju.oulu.fi> and -# hereby submitted to the public domain - -# THIS SLACKBUILD IS DISTRIBUTETD IN THE HOPE OF BEING -# USEFUL BUT WITHOUT ANY WARRANTY. THE AUTHOR IS _NOT_ -# RESPONSIBLE FOR ANY DAMAGE OR DATA LOSS CAUSED BY IT. +# Copyright 2011-2015 Petar Petrov, petar.petrov@student.oulu.fi +# All rights reserved. +# +# Redistribution and use of this script, with or without modification, is +# permitted provided that the following conditions are met: +# +# 1. Redistributions of this script must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PRGNAM=spidey -VERSION=${VERSION:-20060601} -BUILD=${BUILD:-1} +VERSION=${VERSION:-20060601} # Keep the date of the 32bit binary as version. 
+BUILD=${BUILD:-2} TAG=${TAG:-_SBo} -ARCH=i386 +if [ -z "$ARCH" ]; then + case "$( uname -m )" in + i?86) ARCH=i386 ;; + arm*) ARCH=arm ;; + *) ARCH=$( uname -m ) ;; + esac +fi CWD=$(pwd) TMP=${TMP:-/tmp/SBo} PKG=$TMP/package-$PRGNAM OUTPUT=${OUTPUT:-/tmp} + +if [ "$ARCH" != "i386" ] && [ "$ARCH" != "x86_64" ]; then + printf "\n\n$ARCH is not supported... \n" + exit 1 +fi + +# Determine the source arch. Many thanks to the Ugene project for the +# 64bit executable! +if [ "$ARCH" = "x86_64" ]; then + SRCARCH=".64" +else + SRCARCH="" +fi + set -e rm -rf $PKG @@ -28,13 +60,20 @@ cd $TMP rm -rf $PRGNAM-$VERSION mkdir $PRGNAM-$VERSION cd $PRGNAM-$VERSION -gunzip -c $CWD/$PRGNAM.linux.gz > spidey + +gunzip -c $CWD/$PRGNAM.linux${SRCARCH}.gz > spidey install -D -m755 spidey $PKG/usr/bin/spidey +mkdir -p $PKG/usr/man/man1 +cp $CWD/$PRGNAM.1 $PKG/usr/man/man1/$PRGNAM.1 + find $PKG -print0 | xargs -0 file | grep -e "executable" -e "shared object" | grep ELF \ | cut -f 1 -d : | xargs strip --strip-unneeded 2> /dev/null || true +find $PKG/usr/man -type f -exec gzip -9 {} \; +for i in $( find $PKG/usr/man -type l ) ; do ln -s $( readlink $i ).gz $i.gz ; rm $i ; done + mkdir -p $PKG/usr/doc/$PRGNAM-$VERSION cat $CWD/$PRGNAM.SlackBuild > $PKG/usr/doc/$PRGNAM-$VERSION/$PRGNAM.SlackBuild diff --git a/academic/spidey/spidey.info b/academic/spidey/spidey.info index 324a7da4a9..3a21a46c7a 100644 --- a/academic/spidey/spidey.info +++ b/academic/spidey/spidey.info @@ -3,8 +3,8 @@ VERSION="20060601" HOMEPAGE="http://www.ncbi.nlm.nih.gov/spidey/index.html" DOWNLOAD="ftp://ftp.ncbi.nih.gov/pub/wheelan/Spidey/spidey.linux.gz" MD5SUM="2e56ef2e4fcf57eca266fb1b3bb56c7e" -DOWNLOAD_x86_64="UNSUPPORTED" -MD5SUM_x86_64="UNSUPPORTED" +DOWNLOAD_x86_64="http://www.student.oulu.fi/~ppetrov/source/spidey.linux.64.gz" +MD5SUM_x86_64="79f1f95976346e0d0f5c7f717deac176" REQUIRES="" MAINTAINER="Petar Petrov" -EMAIL="ppetrov@paju.oulu.fi" +EMAIL="petar.petrov@student.oulu.fi" |