first batch

Former-commit-id: 1eecb206a17c4aff21d1170b48db134ce3c4f14e
This commit is contained in:
Eric Coissac
2025-03-01 16:15:28 +01:00
parent 4e51d42b85
commit 2c012eec8e
596 changed files with 5247 additions and 77743 deletions

View File

@ -0,0 +1,528 @@
#!/bin/bash
#
# BUILD REFERENCE : From SwissProt
#
#========================================================================================
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
# The inner expansion is quoted so that an install path containing
# whitespace still resolves to a single directory.
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Root of the SwissProt-derived mitochondrial database and its per-gene
# sub-directory. CDS_DATA_DIR is not set in this file; presumably it comes
# from bash_init.sh or the environment -- TODO confirm.
SPDIR="$CDS_DATA_DIR/sp_mitodb"
SPGENESDIR="$SPDIR/genes"

# Space-separated list of the genes that make up the "core" library.
CORE_GENES="atp6 atp8"
CORE_GENES="$CORE_GENES cox1 cox2 cox3"
CORE_GENES="$CORE_GENES cytb"
CORE_GENES="$CORE_GENES nd1 nd2 nd3 nd4 nd4L nd5 nd6"
# Stream the current SwissProt flat file to stdout as plain text.
#
# The remote file is gzip-compressed, while the consumer of this function
# (an awk record filter) expects the textual flat-file format, so the
# stream is decompressed here.
# Outputs: the decompressed SwissProt .dat content on stdout.
# Returns: the exit status of gzip (last stage of the pipeline).
function download_swissprot() {
  local URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"
  # --fail: on an HTTP error emit nothing instead of an HTML error page.
  curl --fail "$URL" | gzip -dc
}
# Keep the flat-file records that are metazoan, mitochondrial and not
# flagged as fragments.
# Arguments: $1 - optional input file; stdin is read when it is omitted.
# Outputs:   the selected records, each terminated by "//".
function extract_mito_entries() {
  # Records are delimited by the "//" line; a multi-character RS needs an
  # awk that supports it (gawk, mawk).
  # ${1:+"$1"} passes the file as ONE word when given and nothing at all
  # otherwise, so the stdin mode keeps working.
  awk -F'\n' '
    BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
    /OC Eukaryota; Metazoa;/ && /OG Mitochondrion\./ && !/DE Flags: Fragment;/ {print $0}
  ' ${1:+"$1"}
}
# Keep the metazoan mitochondrial records whose text contains "<gene>_".
# Arguments: $1 - name to look for (used as an awk regular expression,
#                 followed by a literal "_")
#            $2 - optional input file; stdin is read when it is omitted.
# Outputs:   the selected records, each terminated by "//".
function extract_mito_gene_entries() {
  awk -v gene="$1" -F'\n' '
    BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
    ($0 ~ gene"_") && /OC Eukaryota; Metazoa;/ && /OG Mitochondrion./ {print $0}
  ' ${2:+"$2"}
}
# Keep only the records flagged as fragments ("DE Flags: Fragment;").
# Arguments: $1 - optional input file; stdin is read when it is omitted.
# Outputs:   the selected records, each terminated by "//".
function extract_mito_gene_frg() {
  awk -F'\n' '
    BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
    /DE Flags: Fragment;/ {print $0}
  ' ${1:+"$1"}
}
# Keep the metazoan mitochondrial records carrying a given accession.
# Arguments: $1 - accession number, matched as "AC <ac>;" (regex context)
#            $2 - optional input file; stdin is read when it is omitted.
# Outputs:   the selected records, each terminated by "//".
function extract_mito_gene_ac() {
  awk -v ac="$1" -F'\n' '
    BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
    ($0 ~ "AC " ac ";") && /OC Eukaryota; Metazoa;/ && /OG Mitochondrion./ {print $0}
  ' ${2:+"$2"}
}
# Convert SwissProt flat-file records into FASTA entries annotated with the
# gene name found in the cross-referenced EMBL/ENA record.
#
# For every "DR EMBL;" line the 4th field is taken as the EMBL identifier;
# unless it is "-", the ENA record is downloaded with curl and the first
# "/gene=" qualifier becomes the gene name. Records for which no gene name
# could be established are skipped.
#
# Globals:   AwkCmd (read) - awk interpreter to run
# Arguments: $1 - optional input file; stdin is read when it is omitted.
# Outputs:   FASTA on stdout, passed through formatfasta; headers look like
#            ">ID SP_AC=<ac>; EMBL_AC=<id>; gene=<gene>;"
function extract_fasta_protein() {
  $AwkCmd '
    # Start from the same "unknown" state every record is reset to below,
    # so that a first record without any EMBL cross-reference is skipped
    # like all the following ones instead of being written with an empty
    # gene name.
    BEGIN {
      ID="xxx"
      AC="xxx"
      EMBL="xxx"
      GENE="xxx"
      SEQ=""
    }
    /^ID/ {ID=$2}
    /^AC/ {AC=$2; sub(";","",AC)}
    /^DR EMBL;/ {
      EMBL=$4;
      sub(";","",EMBL);
      sub("-","xxx",EMBL);
      if ( EMBL != "xxx" ) {
        # One network round trip per cross-reference; only the first line
        # produced by the pipeline is read. When nothing is returned GENE
        # keeps its previous value.
        EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | grep -E \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
        EBI | getline GENE
        close(EBI)
      } else {
        GENE = "xxx"
      }
    }
    # Sequence lines: drop the blanks and accumulate.
    /^ / {gsub(/ /,"",$0); SEQ=SEQ $0}
    # End of record: emit it when a gene name is known, then reset.
    /^\/\// {
      if (GENE != "xxx") {
        print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
        print SEQ
      }
      ID="xxx";
      AC="xxx";
      EMBL="xxx";
      GENE="xxx"
      SEQ=""
    }
  ' ${1:+"$1"} \
    | formatfasta
}
# Build sed rules that harmonise gene names per protein family.
#
# Headers are expected to look like ">FAMILY_SPECIES ...; gene=NAME;".
# For every FAMILY (text before the first "_") the (FAMILY, NAME) pairs are
# counted; the most frequent NAME becomes the reference and one sed rule is
# emitted for each other NAME observed for that FAMILY.
#
# Arguments: $1 - FASTA file
# Outputs:   a sed script on stdout (empty when nothing has to be renamed);
#            each rule is preceded by a "#" comment giving the number of
#            observations of the reference name.
function rename_rules() {
  grep -E "^>" "$1" \
    | sed -e 's/^>//' -e 's/_/=/' \
    | tr -d ";" \
    | awk -F"=" '{print $1,$NF}' \
    | sort \
    | uniq -c \
    | awk '{
        # "count family name" -> "family 00count name": zero padding makes
        # the textual ordering below follow the counts.
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -rn \
    | sed -e 's/ /=/' -e 's/ /=/' \
    | awk -F "=" '
        # Lines arrive grouped by family, most frequent name first.
        ($1 == current) {
          sub(/^0+/,"",occurrence)
          print "# based on",occurrence,"observations renames",$NF,"in",gene
          print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
        }
        ($1 != current) {
          current= $1
          gene=$NF
          occurrence=$2
        }
      '
}
# Apply the rules produced by rename_rules until no rule is left.
#
# Every round writes its rules to "<rules>.<n>", where <rules> is the input
# name with its first "fst" replaced by "rules"; these files are left in
# place. The working copies live in the current directory.
#
# Arguments: $1 - FASTA file to normalise
# Outputs:   the renamed FASTA on stdout
function rename_genes() {
  local n=1
  local input=$1
  local rules=${input/fst/rules}
  local work="__tmp__$$__fasta"
  local swap="__tmp2__$$__fasta"

  cat "$input" > "$work"

  #echo $rule 1>&2
  rename_rules "$work" > "$rules.$n"
  while [[ -s "$rules.$n" ]] ; do
    sed -f "$rules.$n" "$work" > "$swap"
    mv "$swap" "$work"
    ((n++))
    rename_rules "$work" > "$rules.$n"
  done

  cat "$work"
  rm "$work"
}
# For every gene name keep only the entries of its most frequent protein
# family (text before the first "_" of the identifier).
#
# Side files, named after the input with ".fst" replaced:
#   _sp_ebi.genes        "gene 00count family", highest count first per gene
#   _to_keep.lst         most frequent family of each gene
#   _to_be_removed.lst   every other family
#   _strange_genes.fst   entries selected by the to-be-removed list
#
# Globals:   AwkCmd (read)
# Arguments: $1 - FASTA file with "gene=NAME;" ending each header
# Outputs:   entries selected by the to-keep list on stdout
function clean_strange_gene_name() {
  local input=$1
  local output=${input/.fst/_sp_ebi.genes}
  local to_keep=${input/.fst/_to_keep.lst}
  local to_be_removed=${input/.fst/_to_be_removed.lst}

  grep '^>' "$input" \
    | sed -e 's/^>//' -e 's/_/=/' -e 's/;$//' \
    | awk -F'=' '{print $NF,$1}' \
    | sort \
    | uniq -c \
    | $AwkCmd '{
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -r > "$output"

  # First line of each gene group = most frequent family.
  $AwkCmd '
    ($1!=current) {current=$1; print $3}
  ' "$output" > "$to_keep"

  # Remaining lines of each group.
  $AwkCmd '
    ($1==current) {print $3}
    ($1!=current) {current=$1}
  ' "$output" > "$to_be_removed"

  filter_sp_fasta_db "$to_be_removed" "$input" > "${input/.fst/_strange_genes.fst}"
  filter_sp_fasta_db "$to_keep" "$input"
}
# Select the FASTA entries whose family (identifier text before the first
# "_") is listed in a file.
#
# The list is turned into an alternation anchored on both sides, so a
# listed "COX1" does not also select "COX10". An empty list selects
# nothing.
#
# Globals:   AwkCmd (read)
# Arguments: $1 - file with one family name per line
#            $2 - optional FASTA file; stdin is read when it is omitted.
# Outputs:   the selected entries on stdout
function filter_sp_fasta_db() {
  local gene_pattern
  # Word splitting of $(cat ...) is intentional: it joins the names with
  # single blanks whatever the layout of the list.
  gene_pattern="$(echo $(cat "$1") | tr ' ' '|')"
  [[ -n "$gene_pattern" ]] || return 0

  $AwkCmd -F "_" -v gene_pattern="^(${gene_pattern})\$" '
    # A new header closes the previous entry.
    /^>/ && seen && (gene ~ gene_pattern) {
      print entry
    }
    /^>/ {
      entry = $0
      gene = $1
      sub(/^>/, "", gene)
      seen = 1
    }
    !/^>/ {
      entry = entry "\n" $0
    }
    END {
      if (seen && (gene ~ gene_pattern))
        print entry
    }
  ' ${2:+"$2"}
}
# Drop the FASTA entries whose identifier (first word of the header,
# without ">") matches one of the patterns listed in a file.
#
# The patterns are joined into one unanchored alternation, so they match
# anywhere in the identifier. An empty list drops nothing.
#
# Globals:   AwkCmd (read)
# Arguments: $1 - file with one pattern per line
#            $2 - optional FASTA file; stdin is read when it is omitted.
# Outputs:   the remaining entries on stdout
function filter_out_sp_fasta_db() {
  local gene_pattern
  # Word splitting of $(cat ...) is intentional (see the join with "|").
  gene_pattern="$(echo $(cat "$1") | tr ' ' '|')"
  if [[ -z "$gene_pattern" ]]; then
    # Nothing to exclude: an empty regex would match every identifier.
    cat -- ${2:+"$2"}
    return
  fi

  $AwkCmd -v gene_pattern="$gene_pattern" '
    # "seen" prevents the empty pseudo-entry that precedes the very first
    # header from being printed as a blank line.
    /^>/ && seen && (gene !~ gene_pattern) {
      print entry
    }
    /^>/ {
      entry = $0
      gene = $1
      sub(/^>/, "", gene)
      seen = 1
    }
    !/^>/ {
      entry = entry "\n" $0
    }
    END {
      if (seen && (gene !~ gene_pattern))
        print entry
    }
  ' ${2:+"$2"}
}
# Distribute FASTA entries into one file per gene:
#   $SPGENESDIR/<gene>/<gene>.fst
# The gene name is the text after the last "=" of the header, without the
# trailing ";". Entries are appended, so the target tree has to be cleared
# by the caller before a rebuild.
#
# Globals:   AwkCmd, SPGENESDIR (read)
# Arguments: $1 - optional FASTA file; stdin is read when it is omitted.
function split_by_gene() {
  $AwkCmd -F "=" -v SPGENES="$SPGENESDIR" '
    function writegene(gene,entry,    outputdir,output) {
      outputdir = SPGENES "/" gene
      output = outputdir "/" gene ".fst"
      # Create each directory once only; the path is quoted for the shell.
      if (!(outputdir in made)) {
        system("mkdir -p \"" outputdir "\"")
        made[outputdir] = 1
      }
      print entry >> output
      close(output)
    }
    /^>/ && gene!="" {
      writegene(gene,entry)
    }
    /^>/ {
      entry = $0
      gene = $NF
      sub(/;$/, "", gene)
    }
    !/^>/ {
      entry = entry "\n" $0
    }
    END {
      # Nothing to flush for an empty input (or a last header without a
      # gene name).
      if (gene != "")
        writegene(gene,entry)
    }
  ' ${1:+"$1"}
}
# Cluster the sequences of one gene with cd-hit.
#
# Reads  <gene>/<gene>.fst and writes <gene>/<gene>.cdhit.fst (cd-hit also
# produces its cluster report next to it), relative to the current
# directory.
#
# Arguments: $1 - gene name (also the name of its directory)
# Returns:   the exit status of cd-hit
function dereplicate() {
  # Both thresholds share the same value; each name matches the option it
  # feeds (cd-hit: -c identity threshold, -s length difference cutoff).
  local -r identity=0.95
  local -r length_ratio=0.95

  local gene="${1}"
  local fastain="${gene}/${gene}.fst"
  local cdhitout="${gene}/${gene}.cdhit.fst"

  cd-hit -i "${fastain}" \
         -o "${cdhitout}" \
         -c "${identity}" \
         -G 1 \
         -g 1 \
         -aL 0.95 \
         -s "${length_ratio}" \
         -b 350 -p 1 \
         -d 0 -n 3
}
# Run dereplicate on every entry of $SPGENESDIR.
# Globals: SPGENESDIR (read)
# Returns: 1 when $SPGENESDIR cannot be entered (nothing is processed then,
#          instead of iterating over the caller's directory).
function dereplicate_genes() {
  pushd "$SPGENESDIR" || return 1
  local g
  for g in * ; do
    dereplicate "$g"
  done
  popd
}
# Format the dereplicated sequences of one gene as a protein BLAST
# database (makeblastdb run through timeoutcmd with a value of 300).
# All output of the command is discarded.
#
# Arguments: $1 - gene name; <gene>/<gene>.cdhit.fst is read relative to
#                 the current directory
function buildGeneBlastDB() {
  local gene="${1}"
  local fastain="${gene}/${gene}.cdhit.fst"

  loginfo " formatting Blast $gene DB"
  timeoutcmd 300 makeblastdb -dbtype prot -in "${fastain}" > /dev/null 2>&1
  loginfo "Done"
}
# Run buildGeneBlastDB on every entry of $SPGENESDIR.
# Globals: SPGENESDIR (read)
# Returns: 1 when $SPGENESDIR cannot be entered (nothing is processed then,
#          instead of iterating over the caller's directory).
function buildBlastDBs() {
  pushd "$SPGENESDIR" || return 1
  local g
  for g in * ; do
    buildGeneBlastDB "$g"
  done
  popd
}
# List the "shell" genes: entries of $SPDIR/genes without a dot in their
# name that are neither core genes, nor ORFs, nor the RPS12 gene.
#
# Globals: SPDIR, CORE_GENES, RPS12_GENE (read). RPS12_GENE is not set in
#          this file -- presumably provided by the sourced init script.
# Outputs: one gene name per line on stdout, and nothing else: the
#          directory stack printed by pushd/popd is sent to stderr because
#          callers capture this output as a gene list.
function list_shell_genes() {
  pushd "$SPDIR" 1>&2 || return 1
  ls genes \
    | grep -v '\.' \
    | grep -E -iv "$(tr " " "|" <<< "$CORE_GENES")" \
    | grep -iv '^orf' \
    | if [[ -n "$RPS12_GENE" ]]; then
        grep -iv "$RPS12_GENE"
      else
        # An empty pattern matches every line, so "grep -v" would drop
        # the whole list.
        cat
      fi
  popd 1>&2
}
# List the "dust" genes: entries of $SPDIR/genes without a dot in their
# name, not matching any core gene, whose name starts with "orf"
# (case-insensitive).
# Globals: SPDIR, CORE_GENES (read)
# Outputs: one gene name per line on stdout
function list_dust_genes() {
  local core_re

  pushd $SPDIR 1>&2
  # Alternation of the core gene names: "a b c" -> "a|b|c".
  core_re=$(tr " " "|" <<< $CORE_GENES)
  ls genes | grep -v '\.' | grep -E -iv $core_re | grep -i '^orf'
  popd 1>&2
}
# Rebuild $SPDIR/core from the dereplicated sequences and BLAST index
# files (.phr, .pin, .psq) of every core gene.
# Globals: SPDIR, CORE_GENES (read)
# Returns: 1 when $SPDIR cannot be entered; nothing is deleted in that
#          case (the "rm -rf" would otherwise hit the caller's directory).
function build_core_libraries() {
  pushd "$SPDIR" 1>&2 || return 1
  local gene ext
  rm -rf core
  mkdir -p core
  # CORE_GENES is a space-separated list: splitting is intended.
  for gene in $CORE_GENES ; do
    for ext in "" .phr .pin .psq ; do
      cp "genes/$gene/$gene.cdhit.fst$ext" "core/$gene.fst$ext"
    done
  done
  popd 1>&2
}
# Rebuild $SPDIR/RPS12 from the dereplicated sequences and BLAST index
# files (.phr, .pin, .psq) of the gene named by RPS12_GENE.
# Globals: SPDIR, RPS12_GENE (read). RPS12_GENE is not set in this file --
#          presumably provided by the sourced init script.
# Returns: 1 when $SPDIR cannot be entered; nothing is deleted in that
#          case (the "rm -rf" would otherwise hit the caller's directory).
function build_rps12_library() {
  pushd "$SPDIR" 1>&2 || return 1
  local gene=$RPS12_GENE
  local ext
  rm -rf RPS12
  mkdir -p RPS12
  for ext in "" .phr .pin .psq ; do
    cp "genes/$gene/$gene.cdhit.fst$ext" "RPS12/$gene.fst$ext"
  done
  popd 1>&2
}
# Rebuild $SPDIR/shell from the dereplicated sequences and BLAST index
# files (.phr, .pin, .psq) of every gene reported by list_shell_genes.
# Globals: SPDIR (read)
# Returns: 1 when $SPDIR cannot be entered; nothing is deleted in that
#          case (the "rm -rf" would otherwise hit the caller's directory).
function build_shell_libraries() {
  pushd "$SPDIR" 1>&2 || return 1
  local gene ext
  rm -rf shell
  mkdir -p shell
  # One gene name per word: splitting of the list is intended.
  for gene in $(list_shell_genes) ; do
    for ext in "" .phr .pin .psq ; do
      cp "genes/$gene/$gene.cdhit.fst$ext" "shell/$gene.fst$ext"
    done
  done
  popd 1>&2
}
# Rebuild $SPDIR/dust from the dereplicated sequences and BLAST index
# files (.phr, .pin, .psq) of every gene reported by list_dust_genes.
# Globals: SPDIR (read)
# Returns: 1 when $SPDIR cannot be entered; nothing is deleted in that
#          case (the "rm -rf" would otherwise hit the caller's directory).
function build_dust_libraries() {
  pushd "$SPDIR" 1>&2 || return 1
  local gene ext
  rm -rf dust
  mkdir -p dust
  # One gene name per word: splitting of the list is intended.
  for gene in $(list_dust_genes) ; do
    for ext in "" .phr .pin .psq ; do
      cp "genes/$gene/$gene.cdhit.fst$ext" "dust/$gene.fst$ext"
    done
  done
  popd 1>&2
}
# Print the annotation line of one gene:
#   "<gene> <gene> -- -- -- <product>"
#
# The accession is read from the first header of
# $SPGENESDIR/<gene>/<gene>.cdhit.fst (second word, "SP_AC=" prefix and
# trailing ";" removed). The matching record is then pulled from
# $SPDIR/rawdata/SP_Mito.dat with extract_mito_gene_ac and its "DE " lines
# are parsed by the embedded awk program:
#   - lines matching "DE +RecName:" give the full name,
#   - among lines matching "DE +Short=" the longest value is kept,
#   - lines matching "DE +EC=" give the EC number.
# The product is the short name when it is longer than 10 characters, the
# full name otherwise; " (EC:<ec>)" is appended when an EC number was seen;
# an empty product becomes "Hypothetical protein of unknown function";
# finally blanks are replaced by "_".
# remove_tails() deletes a " {...};" block and returns the text following
# the first "=".
# NOTE(review): when a DE line carries no "{...}" block the trailing ";"
# is not removed by that substitution -- confirm this is intended.
#
# Globals:   SPGENESDIR, SPDIR, AwkCmd (read)
# Arguments: $1 - gene name
# Outputs:   one line on stdout; pushd/popd chatter goes to stderr
function get_product_line() {
pushd $SPGENESDIR 1>&2
local gene=$1
# Accession of the first (representative) sequence of the gene.
local spac=$(head -1 $gene/$gene.cdhit.fst \
| awk '
{ AC=$2;
sub(/^SP_AC=/,"",AC);
sub(/;$/,"",AC);
print AC}')
popd 1>&2
# rawdata/SP_Mito.dat is addressed relative to $SPDIR.
pushd $SPDIR 1>&2
extract_mito_gene_ac $spac rawdata/SP_Mito.dat \
| grep "^DE " \
| $AwkCmd -v gene=$gene '
function remove_tails(line) {
sub(/ *(\{[^}]+\});/,"",line)
st = index(line,"=")
return substr(line,st+1)
}
/DE +RecName:/ {
full = remove_tails($0)
}
/DE +Short=/ {
ns = remove_tails($0)
if (length(ns) > length(short)) {
short = ns
}
}
/DE +EC=/ {
ec = remove_tails($0)
}
END {
if (length(short) > 10) {
product = short
} else {
product = full
}
if (ec != "") {
product = product " (EC:" ec ")"
}
if (product == "") {
product = "Hypothetical protein of unknown function"
}
gsub(/ /,"_",product)
print gene,gene,"--","--","--",product
}
'
popd 1>&2
}
# Print one annotation line (see get_product_line) for every entry of
# $SPGENESDIR.
# Globals: SPGENESDIR (read)
# Outputs: annotation lines on stdout; pushd/popd chatter goes to stderr
# Returns: 1 when $SPGENESDIR cannot be entered (nothing is processed then,
#          instead of iterating over the caller's directory).
function build_annotate_lst() {
  pushd "$SPGENESDIR" 1>&2 || return 1
  local gene
  for gene in * ; do
    get_product_line "$gene"
  done
  popd 1>&2
}
# Everything below works with paths relative to $SPDIR and starts with
# recursive deletions: stop right away if the directory cannot be entered
# rather than deleting "rawdata"/"genes" from wherever the script was
# started.
pushd "$SPDIR" || exit 1

####
#
# Download and prepare raw library from Swissprot FTP site
#
###

rm -rf rawdata
mkdir -p rawdata
pushd rawdata || exit 1
download_swissprot | extract_mito_entries > SP_Mito.dat
extract_fasta_protein SP_Mito.dat > SP_Mito_gene_db.fst
popd

####
#
# Clean swiss-prot fasta file for gene name annotation
#
###

pushd rawdata || exit 1
rename_genes SP_Mito_gene_db.fst > SP_Mito_gene_db.clean_name.fst
clean_strange_gene_name SP_Mito_gene_db.clean_name.fst \
  > SP_Mito_gene_db.good_gene.fst
popd

####
#
# Prepare the database for all genes
#
###

# split_by_gene appends to the per-gene files, hence the clean start.
rm -rf genes
split_by_gene rawdata/SP_Mito_gene_db.good_gene.fst
dereplicate_genes
buildBlastDBs

####
#
# Prepare the different gene databases for CDS annotation
#
###

build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library

####
#
# Build Annotation file
#
###

build_annotate_lst > Annot.lst

# ls ../../mitodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst > join_core.lst

popd

BIN
detectors/ir/.DS_Store vendored

Binary file not shown.

View File

@ -0,0 +1,23 @@
#!/bin/bash
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# List the GenBank files (*.gbk*, optionally gzip-compressed) of directory
# $1 that mention "Metazoa" on their ORGANISM line or on the line that
# follows it.
#
# grep -H prefixes every line with "<file>:" (matching line) or "<file>-"
# (context line); the first word is kept, filtered on ".gbk", and the sed
# command removes that trailing separator character.
# Note: only the first blank-delimited word is kept, so file names
# containing blanks are still truncated by the awk step.
(
  for f in "$1"/*.gbk* ; do
    if [[ "$f" =~ \.gz$ ]] ; then
      GREP=zgrep
    else
      GREP=grep
    fi
    ${GREP} -H -A 1 ' ORGANISM' "$f"
  done
) \
  | grep -B 1 Metazoa \
  | $AwkCmd '{print $1}' \
  | grep '\.gbk' \
  | sed -E 's/(^.*\.gbk(\.gz)?).$/\1/' \
  | uniq

Binary file not shown.

View File

@ -0,0 +1,141 @@
#!/bin/bash
#
# BUILD RRNA models
#
#========================================================================================
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
# The inner expansion is quoted so that an install path containing
# whitespace still resolves to a single directory.
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
# Rewrite a FASTA stream with every sequence on a single line.
# Globals:   AwkCmd (read)
# Arguments: $1 - optional FASTA file; stdin is read when it is omitted.
# Outputs:   header lines unchanged, each followed by its joined sequence
function fasta1li {
  # ${1:+"$1"} passes the file as ONE word when given and nothing at all
  # otherwise, so the function still works as a pipeline filter.
  $AwkCmd '
    /^>/ {
      # A new header closes the sequence collected so far.
      if (sequence) {
        print sequence
      }
      print $0
      sequence = ""
    }
    !/^>/ {
      sequence = sequence $0
    }
    END {
      print sequence
    }
  ' ${1:+"$1"}
}
# Keep one representative per sumaclust cluster (threshold 1).
#
# Only the entries flagged "cluster_center=True;" survive. Their original
# "count=N; " attribute is removed and "cluster_weight" is renamed to
# "count"; each identifier receives a running "_<n>" suffix and the header
# is reduced to "<id>_<n> count=<weight>;".
#
# Globals:   DATA (written), AwkCmd (read)
# Arguments: $1 - FASTA file
# Outputs:   the representatives, one sequence per line, on stdout
function dereplicate {
  DATA=$1
  sumaclust -t 1 $DATA \
    | fasta1li \
    | grep -A 1 '^>' \
    | grep -A1 'cluster_center=True;' \
    | grep -v -- -- \
    | sed -E -e "s/count=[0-9]+; //" -e 's/cluster_weight/count/' \
    | $AwkCmd '{
        if ($0 ~ /^>/) {
          SEQ++
          match($0,"count=[0-9][0-9]*;")
          count=substr($0,RSTART,RLENGTH)
          $1=$1"_"SEQ
          print $1,count
        } else {
          print $0
        }
      }'
}
# Cluster <name>.fasta with sumaclust (threshold 0.9) and align every
# cluster with muscle.
#
# Produces <name>.clust.fasta and, in a freshly recreated directory
# <name>/, one pair of files per cluster:
#   <weight on 5 digits>_<cluster id>.fasta        members of the cluster
#   <weight on 5 digits>_<cluster id>.align.fasta  muscle alignment
#
# Arguments: $1 - base name (file <name>.fasta must exist)
# Returns:   1 when no base name is given
function clustering {
  local DATA=$1
  local clust="${DATA}.clust.fasta"
  local cluster c w out

  [[ -n "$DATA" ]] || return 1

  rm -rf "$DATA"
  mkdir "$DATA"

  sumaclust -t 0.9 "$DATA.fasta" \
    | fasta1li > "$clust"

  cluster=$(grep '^>' "$clust" \
    | sed -E 's/.*cluster=([^;]+);.*$/\1/' \
    | sort -u)

  # One cluster id per word: splitting of the list is intended.
  for c in $cluster; do
    # Match the complete "cluster=<id>;" attribute as a fixed string: a
    # bare substring search on the id would also pick the members of any
    # cluster whose id merely starts with (or contains) this one.
    w=$(grep -F "cluster=${c};" "$clust" \
      | head -1 \
      | sed -E 's/.*cluster_weight=([^;]+);.*$/\1/')
    out=$(printf '%s/%05d_%s' "$DATA" "$w" "$c")
    # "grep -v -- --" removes the group separators added by -A1.
    grep -F -A1 "cluster=${c};" "$clust" \
      | grep -v -- -- > "$out.fasta"
    muscle -in "$out.fasta" -out "$out.align.fasta"
  done
}
# Reverse-complement every sequence of a FASTA file.
#
# Headers are kept unchanged; sequences are written 60 characters per
# line. Only a, c, g, t (both cases) are complemented, any other character
# is kept as it is. The computation is done inside awk: no external
# command is started per sequence and the sequence text never goes through
# a shell.
#
# Globals:   AwkCmd (read)
# Arguments: $1 - optional FASTA file; stdin is read when it is omitted.
function revcomp {
  $AwkCmd '
    function printfasta(seq,    i, seqlen) {
      seqlen = length(seq)
      for (i = 1; i <= seqlen; i += 60)
        print substr(seq, i, 60)
    }
    function revcomp(seq,    i, c, res) {
      res = ""
      for (i = length(seq); i >= 1; i--) {
        c = substr(seq, i, 1)
        res = res ((c in COMP) ? COMP[c] : c)
      }
      return res
    }
    BEGIN {
      # Complement table.
      split("a c g t A C G T", from, " ")
      split("t g c a T G C A", to, " ")
      for (k in from)
        COMP[from[k]] = to[k]
    }
    # A new header flushes the sequence collected so far.
    (seq) && /^>/ {
      print head
      printfasta(revcomp(seq))
      seq = ""
    }
    /^>/   { head = $0 }
    ! /^>/ { seq = seq $0 }
    END {
      print head
      printfasta(revcomp(seq))
    }
  ' ${1:+"$1"}
}
# Build the mitochondrial 12S and 16S rRNA reference sets: select the
# metazoan GenBank entries, extract each rRNA, dereplicate, cluster/align
# and copy the result under $RRNA_DATA_DIR. All intermediate files are
# written in a temporary working directory.
pushTmpDir ORG.buildRRNAMito

loginfo "Tempdir: $(pwd)"
openLogFile "${RRNA_DATA_DIR}/rRNA_mito_models.log"

loginfo "Selecting Metazoa genebank entries..."
# "$@" keeps every command-line argument as a separate, unsplit word.
METAZOA=$("${PROG_DIR}/../../normalize/tools/selectMetazoa.sh" "$@")
loginfo " --> $(echo ${METAZOA} | wc -w) entries selected"
loginfo "Done"

loginfo "Extracting 12S rRNA sequences..."
rm -f raw_12S.fasta
# METAZOA is a blank-separated list of files: splitting is intended.
for f in ${METAZOA}; do
  loginfo "Extracting 12S rRNA sequences from ${f}..."
  "${PROG_DIR}/extract_ref12S.sh" "${f}" >> raw_12S.fasta
done
loginfo " --> $(fastaCount raw_12S.fasta) retreived sequences"
# Overwrite, like the 16S branch below: appending would accumulate
# duplicates if a 12S.fasta were already present.
dereplicate raw_12S.fasta > 12S.fasta
loginfo " --> $(fastaCount 12S.fasta) distinct sequences"
loginfo "Done"

loginfo "Clustering 12S rRNA sequences..."
clustering 12S
loginfo "Done"

loginfo "Installing 12S rRNA sequences..."
cp -r 12S "${RRNA_DATA_DIR}/RRNA_12S_mito"
loginfo "Done"

loginfo "Extracting 16S rRNA sequences..."
rm -f raw_16S.fasta
for f in ${METAZOA}; do
  "${PROG_DIR}/extract_ref16S.sh" "${f}" >> raw_16S.fasta
done
loginfo " --> $(fastaCount raw_16S.fasta) retreived sequences"
dereplicate raw_16S.fasta > 16S.fasta
loginfo " --> $(fastaCount 16S.fasta) distinct sequences"
loginfo "Done"

loginfo "Clustering 16S rRNA sequences..."
clustering 16S
loginfo "Done"

loginfo "Installing 16S rRNA sequences..."
cp -r 16S "${RRNA_DATA_DIR}/RRNA_16S_mito"
loginfo "Done"

popTmpDir

View File

@ -0,0 +1,55 @@
#!/bin/bash
#
# -- CAUTION -- Works as long as the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
# Extract the 12S rRNA sequence from the GenBank file(s) given as arguments
# and print it in FASTA format.
#
# What the awk program below does, rule by rule:
#  - a LOCUS line stores its 2nd field in AC and resets the sequence
#    buffer, the "in sequence" flag and FROM/TO;
#  - an rRNA feature line stores its 2nd field (the location) in LOCUS and
#    sets STRAND to 1; when the line contains "complement", STRAND becomes
#    0 and the "complement(" ... ")" wrapper is removed from the location;
#  - any line containing "12S" splits the current location on "." and sets
#    FROM (1st part), TO (3rd part) and LENGTH = TO-FROM+1;
#  - ORIGIN switches sequence collection on; each sequence line then
#    contributes its fields 2 to 7, with every character other than
#    a, c, g, t replaced by "n";
#  - at the "//" line, if FROM is set, the header
#    ">RRNA12S_<AC> Strand=<STRAND>; cut=<FROM>..<TO>; seq_length=<LENGTH>;"
#    is printed followed by the FROM..TO sub-sequence, 60 characters per
#    line, reverse-complemented when STRAND is 0.
# comp() and rev() run "tr" and "rev" through a shell for each call.
# NOTE(review): the location variable (LOCUS) is not reset on a new LOCUS
# line, and the "12S" rule fires on any line containing that text; with
# multi-record input a stale location could be picked up -- confirm the
# input is always one record per file.
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
function comp(seq) { \
"echo "seq" | tr acgtACGT tgcaTGCA " | getline res; \
close("echo "seq" | tr acgtACGT tgcaTGCA "); \
return res; \
} \
function rev(seq) { \
"echo "seq" | rev " | getline res; \
close("echo "seq" | rev "); \
return res; \
} \
function revcomp(seq) { \
res=rev(comp(seq)); \
return res; \
} \
\
/^LOCUS / {AC=$2; sequence=""; seqon=0; FROM="";TO=""} \
/^ rRNA / {LOCUS=$2; STRAND=1} \
/^ rRNA / && /complement/ {STRAND=0; \
sub("complement\\(","",LOCUS); \
sub("\\)","",LOCUS); \
} \
/12S/ {split(LOCUS,POS,"."); \
FROM=POS[1]; \
TO=POS[3]; \
LENGTH=TO-FROM+1 \
} \
/^ORIGIN/ {seqon=1} \
/^ *[1-9][0-9]* [a-z ]+$/ && seqon {seq=$2 $3 $4 $5 $6 $7; \
gsub("[^acgt]","n",seq);\
sequence=sequence seq \
} \
/^\/\// && FROM \
{print ">RRNA12S_"AC" Strand="STRAND";", \
"cut="FROM".."TO";", \
"seq_length="LENGTH";"; \
SS=substr(sequence,FROM,LENGTH); \
if (! STRAND) \
SS=revcomp(SS); \
printfasta(SS); \
} \
' $*

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -1,8 +1,12 @@
#!/bin/bash
#
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -2,7 +2,12 @@
#
gawk 'function printfasta(seq) { \
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"
function revcomp {
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -0,0 +1,29 @@
#!/bin/bash
#
# splitgbk.sh:
# Split a gbk file in multiple files
# each containing a single sequence
#
#========================================================================================
# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Split a GenBank file into one file per record.
#
# The records are appended to <dest>/<accession>.gbk, where <accession> is
# the 2nd field of the LOCUS line and <dest> is the input path with
# everything from the first dot of its FILE NAME removed. Dots in the
# directory part (e.g. "./data/x.gbk", "run.1/x.gbk") are left alone.
# Lines found before the first LOCUS line are ignored.
#
# Globals:   AwkCmd (read)
# Arguments: $1 - GenBank file
# Returns:   2 when no file is given, 1 when <dest> cannot be created,
#            the status of awk otherwise
function split_gbk() {
  local inputfile=$1
  local dest base

  if [[ -z "$inputfile" ]]; then
    echo "Usage: splitgbk.sh <genbank file>" 1>&2
    return 2
  fi

  if [[ "$inputfile" == */* ]]; then
    base=${inputfile##*/}
    dest="${inputfile%/*}/${base%%.*}"
  else
    dest=${inputfile%%.*}
  fi

  mkdir -p "$dest" || return 1

  $AwkCmd -v dest="$dest" '
    /^LOCUS/ {
      AC=$2;
      destfile = sprintf("%s/%s.gbk", dest, AC);
    }
    destfile != "" {
      print $0 >> destfile
    }
    /^\/\// && destfile != "" {
      close(destfile);
    }
  ' "$inputfile"
}

split_gbk "$@"

View File

@ -32,7 +32,7 @@ pushTmpDir ORG.trna
TRNA=$(basename ${QUERY})
aragorn -i -w -seq -gc11 ${QUERY} | \
aragorn -i -w -seq -gc${2} ${QUERY} | \
${AwkCmd} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk

View File

@ -0,0 +1,390 @@
'\" t
.\" Title: aragorn
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 02/24/2013
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "ARAGORN" "1" "02/24/2013" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.\" http://bugs.debian.org/507673
.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.\" -----------------------------------------------------------------
.\" * set default formatting
.\" -----------------------------------------------------------------
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
.ad l
.\" -----------------------------------------------------------------
.\" * MAIN CONTENT STARTS HERE *
.\" -----------------------------------------------------------------
.SH "NAME"
aragorn \- detect tRNA genes in nucleotide sequences
.SH "SYNOPSIS"
.sp
\fBaragorn\fR [\fIOPTION\fR]\&... \fIFILE\fR
.SH "OPTIONS"
.PP
\fB\-m\fR
.RS 4
Search for tmRNA genes\&.
.RE
.PP
\fB\-t\fR
.RS 4
Search for tRNA genes\&. By default, all are detected\&. If one of
\fB\-m\fR
or
\fB\-t\fR
is specified, then the other is not detected unless specified as well\&.
.RE
.PP
\fB\-mt\fR
.RS 4
Search for Metazoan mitochondrial tRNA genes\&. tRNA genes with introns not detected\&.
\fB\-i\fR,
\fB\-sr\fR
switches ignored\&. Composite Metazoan mitochondrial genetic code used\&.
.RE
.PP
\fB\-mtmam\fR
.RS 4
Search for Mammalian mitochondrial tRNA genes\&.
\fB\-i\fR,
\fB\-sr\fR
switches ignored\&.
\fB\-tv\fR
switch set\&. Mammalian mitochondrial genetic code used\&.
.RE
.PP
\fB\-mtx\fR
.RS 4
Same as
\fB\-mt\fR
but low scoring tRNA genes are not reported\&.
.RE
.PP
\fB\-mtd\fR
.RS 4
Overlapping metazoan mitochondrial tRNA genes on opposite strands are reported\&.
.RE
.PP
\fB\-gc\fR[\fInum\fR]
.RS 4
Use the GenBank transl_table = [\fInum\fR] genetic code\&. Individual modifications can be appended using
\fI,BBB\fR=<aa> B = A,C,G, or T\&. <aa> is the three letter code for an amino\-acid\&. More than one modification can be specified\&. eg
\fB\-gcvert\fR,aga=Trp,agg=Trp uses the Vertebrate Mitochondrial code and the codons AGA and AGG changed to Tryptophan\&.
.RE
.PP
\fB\-gcstd\fR
.RS 4
Use standard genetic code\&.
.RE
.PP
\fB\-gcmet\fR
.RS 4
Use composite Metazoan mitochondrial genetic code\&.
.RE
.PP
\fB\-gcvert\fR
.RS 4
Use Vertebrate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcinvert\fR
.RS 4
Use Invertebrate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcyeast\fR
.RS 4
Use Yeast mitochondrial genetic code\&.
.RE
.PP
\fB\-gcprot\fR
.RS 4
Use Mold/Protozoan/Coelenterate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcciliate\fR
.RS 4
Use Ciliate genetic code\&.
.RE
.PP
\fB\-gcflatworm\fR
.RS 4
Use Echinoderm/Flatworm mitochondrial genetic code\&.
.RE
.PP
\fB\-gceuplot\fR
.RS 4
Use Euplotid genetic code\&.
.RE
.PP
\fB\-gcbact\fR
.RS 4
Use Bacterial/Plant Chloroplast genetic code\&.
.RE
.PP
\fB\-gcaltyeast\fR
.RS 4
Use alternative Yeast genetic code\&.
.RE
.PP
\fB\-gcascid\fR
.RS 4
Use Ascidian Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcaltflat\fR
.RS 4
Use alternative Flatworm Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcblep\fR
.RS 4
Use Blepharisma genetic code\&.
.RE
.PP
\fB\-gcchloroph\fR
.RS 4
Use Chlorophycean Mitochondrial genetic code\&.
.RE
.PP
\fB\-gctrem\fR
.RS 4
Use Trematode Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcscen\fR
.RS 4
Use Scenedesmus obliquus Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcthraust\fR
.RS 4
Use Thraustochytrium Mitochondrial genetic code\&.
.RE
.PP
\fB\-tv\fR
.RS 4
Do not search for mitochondrial TV replacement loop tRNA genes\&. Only relevant if
\fB\-mt\fR
used\&.
.RE
.PP
\fB\-c7\fR
.RS 4
Search for tRNA genes with 7 base C\-loops only\&.
.RE
.PP
\fB\-i\fR
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length 3000 bases\&. Minimum intron length is 0 bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-i\fR[\fImax\fR]
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length [\fImax\fR] bases\&. Minimum intron length is 0 bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-i\fR[\fImin\fR],[\fImax\fR]
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length [\fImax\fR] bases, and minimum length [\fImin\fR] bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-io\fR
.RS 4
Same as
\fB\-i\fR, but allow tRNA genes with long introns to overlap shorter tRNA genes\&.
.RE
.PP
\fB\-if\fR
.RS 4
Same as
\fB\-i\fR, but fix intron between positions 37 and 38 on C\-loop (one base after anticodon)\&.
.RE
.PP
\fB\-ifo\fR
.RS 4
Same as
\fB\-if\fR
and
\fB\-io\fR
combined\&.
.RE
.PP
\fB\-ir\fR
.RS 4
Same as
\fB\-i\fR, but report tRNA genes with minimum length [\fImin\fR] bases rather than search for tRNA genes with minimum length [\fImin\fR] bases\&. With this switch, [\fImin\fR] acts as an output filter, minimum intron length for searching is still 0 bases\&.
.RE
.PP
\fB\-c\fR
.RS 4
Assume that each sequence has a circular topology\&. Search wraps around each end\&. Default setting\&.
.RE
.PP
\fB\-l\fR
.RS 4
Assume that each sequence has a linear topology\&. Search does not wrap\&.
.RE
.PP
\fB\-d\fR
.RS 4
Double\&. Search both strands of each sequence\&. Default setting\&.
.RE
.PP
\fB\-s\fR or \fB\-s+\fR
.RS 4
Single\&. Do not search the complementary (antisense) strand of each sequence\&.
.RE
.PP
\fB\-sc\fR or \fB\-s\-\fR
.RS 4
Single complementary\&. Do not search the sense strand of each sequence\&.
.RE
.PP
\fB\-ps\fR
.RS 4
Lower scoring thresholds to 95% of default levels\&.
.RE
.PP
\fB\-ps\fR[\fInum\fR]
.RS 4
Change scoring thresholds to [\fInum\fR] percent of default levels\&.
.RE
.PP
\fB\-rp\fR
.RS 4
Flag possible pseudogenes (score < 100 or tRNA anticodon loop <> 7 bases long)\&. Note that genes with score < 100 will not be detected or flagged if scoring thresholds are not also changed to below 100% (see \-ps switch)\&.
.RE
.PP
\fB\-seq\fR
.RS 4
Print out primary sequence\&.
.RE
.PP
\fB\-br\fR
.RS 4
Show secondary structure of tRNA gene primary sequence using round brackets\&.
.RE
.PP
\fB\-fasta\fR
.RS 4
Print out primary sequence in fasta format\&.
.RE
.PP
\fB\-fo\fR
.RS 4
Print out primary sequence in fasta format only (no secondary structure)\&.
.RE
.PP
\fB\-fon\fR
.RS 4
Same as
\fB\-fo\fR, with sequence and gene numbering in header\&.
.RE
.PP
\fB\-fos\fR
.RS 4
Same as
\fB\-fo\fR, with no spaces in header\&.
.RE
.PP
\fB\-fons\fR
.RS 4
Same as
\fB\-fo\fR, with sequence and gene numbering, but no spaces\&.
.RE
.PP
\fB\-w\fR
.RS 4
Print out in Batch mode\&.
.RE
.PP
\fB\-ss\fR
.RS 4
Use the stricter canonical 1\-2 bp spacer1 and 1 bp spacer2\&. Ignored if
\fB\-mt\fR
set\&. Default is to allow 3 bp spacer1 and 0\-2 bp spacer2, which may degrade selectivity\&.
.RE
.PP
\fB\-v\fR
.RS 4
Verbose\&. Prints out information during search to STDERR\&.
.RE
.PP
\fB\-a\fR
.RS 4
Print out tRNA domain for tmRNA genes\&.
.RE
.PP
\fB\-a7\fR
.RS 4
Restrict tRNA astem length to a maximum of 7 bases\&.
.RE
.PP
\fB\-aa\fR
.RS 4
Display message if predicted iso\-acceptor species does not match species in sequence name (if present)\&.
.RE
.PP
\fB\-j\fR
.RS 4
Display 4\-base sequence on 3\*(Aq end of astem regardless of predicted amino\-acyl acceptor length\&.
.RE
.PP
\fB\-jr\fR
.RS 4
Allow some divergence of 3\*(Aq amino\-acyl acceptor sequence from NCCA\&.
.RE
.PP
\fB\-jr4\fR
.RS 4
Allow some divergence of 3\*(Aq amino\-acyl acceptor sequence from NCCA, and display 4 bases\&.
.RE
.PP
\fB\-q\fR
.RS 4
Do not print configuration line (which switches and files were used)\&.
.RE
.PP
\fB\-rn\fR
.RS 4
Repeat sequence name before summary information\&.
.RE
.PP
\fB\-O\fR [\fIoutfile\fR]
.RS 4
Print output to
[\fIoutfile\fR]\&. If [\fIoutfile\fR] already exists, it is overwritten\&. By default all output goes to stdout\&.
.RE
.SH "DESCRIPTION"
.sp
aragorn detects tRNA, mtRNA, and tmRNA genes\&. A minimum requirement is at least a 32 bit compiler architecture (variable types int and unsigned int are at least 4 bytes long)\&.
.sp
[\fIFILE\fR] is assumed to contain one or more sequences in FASTA format\&. Results of the search are printed to STDOUT\&. All switches are optional and case\-insensitive\&. Unless \-i is specified, tRNA genes containing introns are not detected\&.
.SH "AUTHORS"
.sp
Bjorn Canback <bcanback@acgt\&.se>, Dean Laslett <gaiaquark@gmail\&.com>
.SH "REFERENCES"
.sp
Laslett, D\&. and Canback, B\&. (2004) ARAGORN, a program for the detection of transfer RNA and transfer\-messenger RNA genes in nucleotide sequences Nucleic Acids Research, 32;11\-16
.sp
Laslett, D\&. and Canback, B\&. (2008) ARWEN: a program to detect tRNA genes in metazoan mitochondrial nucleotide sequences Bioinformatics, 24(2); 172\-175\&.