2c012eec8e0a3e814b36b2482af170cc51a0eb7d

@ -0,0 +1,528 @@
#!/bin/bash
#
#                           BUILD REFERENCE : From SwissProt
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink

# Directory holding this script. The inner expansion is quoted so that an
# installation path containing spaces is not word-split by dirname.
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Root of the SwissProt-derived mitochondrial database and its per-gene
# sub-directory. CDS_DATA_DIR is not assigned in this file.
SPDIR="$CDS_DATA_DIR/sp_mitodb"
SPGENESDIR="$SPDIR/genes"

# Space-separated list of the genes copied into the "core" library.
CORE_GENES="atp6 atp8"
CORE_GENES="$CORE_GENES cox1 cox2 cox3"
CORE_GENES="$CORE_GENES cytb"
CORE_GENES="$CORE_GENES nd1 nd2 nd3 nd4 nd4L nd5 nd6"

# Stream the current SwissProt flat file to stdout as plain text.
#
# The URL points at a gzip-compressed file (.dat.gz) while the consumer of
# this stream is a text-based awk filter, so the download is decompressed
# on the fly.
#
# Outputs: the uncompressed flat file on stdout.
function download_swissprot() {
    local url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"

    # --fail: do not feed an HTTP error page to gzip; --location: follow redirects.
    curl --fail --location "$url" | gzip -dc
}

# Keep the complete (non-fragment) metazoan mitochondrial entries of a
# SwissProt flat file.
#
# Arguments: optional flat-file path(s); stdin is read when none is given.
# Outputs:   the selected entries on stdout, each terminated by "//".
function extract_mito_entries() {
    # A whole entry is one awk record (RS is the "//" terminator line), so
    # every pattern is tested against the full entry text. The awk named
    # by AwkCmd is preferred because a multi-character RS is required.
    ${AwkCmd:-awk} -F'\n' '
        BEGIN { RS = "//\n"; ORS = RS; OFS = "\n" }
        /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion\./ && !/DE   Flags: Fragment;/
    ' "$@"
}

# Keep the metazoan mitochondrial entries whose text contains "<gene>_".
#
# Arguments: $1 - prefix searched as "<gene>_" anywhere in the entry
#            $2... - flat-file path(s); stdin is read when none is given
# Outputs:   the selected entries on stdout, each terminated by "//".
function extract_mito_gene_entries() {
    local gene=$1
    shift

    # The dot of "Mitochondrion." is escaped so that only the literal
    # organelle line matches, as in extract_mito_entries.
    ${AwkCmd:-awk} -v gene="$gene" -F'\n' '
        BEGIN { RS = "//\n"; ORS = RS; OFS = "\n" }
        ($0 ~ gene "_") && /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion\./
    ' "$@"
}

# Keep the entries flagged as fragments ("DE   Flags: Fragment;").
#
# Arguments: optional flat-file path(s); stdin is read when none is given.
# Outputs:   the selected entries on stdout, each terminated by "//".
function extract_mito_gene_frg() {
    ${AwkCmd:-awk} -F'\n' '
        BEGIN { RS = "//\n"; ORS = RS; OFS = "\n" }
        /DE   Flags: Fragment;/
    ' "$@"
}

# Keep the metazoan mitochondrial entries whose AC line starts with the
# given accession ("AC   <ac>;").
#
# Arguments: $1 - accession number
#            $2... - flat-file path(s); stdin is read when none is given
# Outputs:   the selected entries on stdout, each terminated by "//".
function extract_mito_gene_ac() {
    local ac=$1
    shift

    ${AwkCmd:-awk} -v ac="$ac" -F'\n' '
        BEGIN { RS = "//\n"; ORS = RS; OFS = "\n" }
        ($0 ~ "AC   " ac ";") && /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion\./
    ' "$@"
}

# Convert a SwissProt flat file ($1) into protein FASTA records annotated
# with a gene name looked up at the EBI/ENA web service.
#
# For every entry the awk program below remembers the ID, the first AC,
# and, for each "DR   EMBL;" line, the 4th field of that line. Unless that
# field is "-", it runs curl against the ENA browser API and keeps the
# first "/gene=" qualifier value returned. Sequence lines (starting with
# three spaces) are concatenated with blanks removed. At the "//"
# terminator a record
#     >ID SP_AC=<ac>; EMBL_AC=<embl>; gene=<gene>;
# followed by the sequence is printed unless the gene is still "xxx".
# The result is piped through formatfasta (defined outside this file).
#
# NOTE(review): one network request is made per "DR   EMBL;" line, so this
#   step needs network access and can be slow.
# NOTE(review): when getline reads nothing, GENE keeps its previous value;
#   ID/AC/EMBL/GENE are only reset to "xxx" after the first "//", so the
#   first entry starts with empty values -- confirm this is intended.
function extract_fasta_protein() {
    $AwkCmd '
        /^ID/ {ID=$2} 
        /^AC/ {AC=$2; sub(";","",AC)} 
        /^DR   EMBL;/ {
            EMBL=$4; 
            sub(";","",EMBL); 
            sub("-","xxx",EMBL);
            if ( EMBL != "xxx" ) {
                EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
                EBI | getline GENE
                close(EBI)
            } else {
                GENE = "xxx"
            }
            } 
        /^   / {gsub(/ /,"",$0); SEQ=SEQ $0}
        /^\/\// {
            if (GENE != "xxx") {
                print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
                print SEQ 
            }
            ID="xxx";
            AC="xxx"; 
            EMBL="xxx"; 
            GENE="xxx"
            SEQ=""
        }
        ' $1 \
    | formatfasta
}

# Derive sed rules that harmonise the gene names of a FASTA file.
#
# Headers are expected to look like ">NAME_SPECIES ...; gene=GENE;". For
# every NAME (the text before the first "_") the most frequent gene is
# taken as reference, and one sed rule is printed for each other gene
# observed with that NAME, rewriting it to the reference.
#
# Arguments: FASTA path(s); stdin is read when none is given.
# Outputs:   a sed script on stdout (empty when nothing has to be renamed).
function rename_rules() {
    # grep -E replaces egrep, which current GNU grep flags as obsolescent.
    grep -E "^>" "$@" \
    | sed -e 's/^>//' -e 's/_/=/' \
    | tr -d ";" \
    | awk -F"=" '{print $1,$NF}' \
    | sort \
    | uniq -c \
    | awk '{
        # "count NAME GENE" becomes "NAME 00000count GENE"
        n=$1
        $1=$2
        $2=sprintf("%06d",n)
        print $0}' \
    | sort -rn \
    | sed -e 's/ /=/' -e 's/ /=/' \
    | awk -F "=" '
        # Lines arrive as NAME=COUNT=GENE. The first line of each NAME
        # group carries the reference gene; every following line of the
        # group produces one rename rule.
        ($1 == current) {
            sub(/^0+/,"",occurrence)
            print "# based on",occurrence,"observations renames",$NF,"in",gene
            print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
        }
        ($1 != current) {
            current= $1
            gene=$NF
            occurrence=$2
        }
        '
}

# Apply the rules produced by rename_rules repeatedly until no rule is
# generated any more, and print the harmonised FASTA on stdout.
#
# Arguments: $1 - FASTA file (conventionally ending in ".fst")
# Outputs:   renamed FASTA on stdout; the rule set of each round is kept
#            next to the input as "<input without .fst>.rules.<round>"
#            (the last one is empty).
function rename_genes() {
    local input=$1
    # Only the ".fst" suffix is swapped; a "fst" elsewhere in the path is
    # left alone.
    local rules="${input%.fst}.rules"
    local n=1
    local workdir current next

    # Scratch copies live in a private directory, not in the caller's cwd.
    workdir=$(mktemp -d "${TMPDIR:-/tmp}/rename_genes.XXXXXX") || return
    current="${workdir}/current.fst"
    next="${workdir}/next.fst"

    cat -- "$input" > "$current"

    rename_rules "$current" > "${rules}.${n}"
    while [[ -s "${rules}.${n}" ]] ; do
        sed -f "${rules}.${n}" "$current" > "$next"
        mv -- "$next" "$current"
        ((n++))
        rename_rules "$current" > "${rules}.${n}"
    done

    cat -- "$current"
    rm -rf -- "$workdir"
}


# Separate the entries carrying the dominant protein name of each gene from
# the other ("strange") ones.
#
# For every gene the protein names (header text before the first "_") are
# counted; the most frequent name of each gene goes to the keep list and
# every other name to the removal list. Both lists are then handed to
# filter_sp_fasta_db.
#
# Arguments: $1 - FASTA file (conventionally ending in ".fst")
# Outputs:   entries selected by the keep list on stdout.
# Side files (next to the input, ".fst" suffix replaced):
#   *_sp_ebi.genes        "GENE COUNT NAME" table, reverse sorted
#   *_to_keep.lst         dominant protein name of each gene
#   *_to_be_removed.lst   remaining protein names
#   *_strange_genes.fst   entries selected by the removal list
function clean_strange_gene_name() {
    local input=$1
    # Strip the suffix once so a path without ".fst" can never map a side
    # file onto the input itself.
    local stem=${input%.fst}
    local output="${stem}_sp_ebi.genes"
    local to_keep="${stem}_to_keep.lst"
    local to_be_removed="${stem}_to_be_removed.lst"

    grep '^>' "$input" \
    | sed -e 's/^>//' -e 's/_/=/' -e 's/;$//' \
    | awk -F'=' '{print $NF,$1}' \
    | sort \
    | uniq -c \
    | ${AwkCmd:-awk} '{
        n=$1
        $1=$2
        $2=sprintf("%06d",n)
        print $0}' \
    | sort -r > "$output"

    # First line of each gene group: the most frequent protein name.
    ${AwkCmd:-awk} '
        ($1!=current) {current=$1; print $3}
        ' "$output" > "$to_keep"

    # Every further line of a gene group: a minority protein name.
    ${AwkCmd:-awk} '
        ($1==current) {print $3}
        ($1!=current) {current=$1}
        ' "$output" > "$to_be_removed"

    filter_sp_fasta_db "$to_be_removed" "$input" > "${stem}_strange_genes.fst"
    filter_sp_fasta_db "$to_keep" "$input"
}

# Print the FASTA entries whose protein name is listed in a file.
#
# The protein name of an entry is the header text between ">" and the
# first "_". Names are compared for equality, so listing COX1 does not
# select COX10, and an empty list selects nothing.
#
# Arguments: $1 - file with whitespace-separated protein names
#            $2... - FASTA path(s); stdin is read when none is given
# Outputs:   the selected entries on stdout.
function filter_sp_fasta_db() {
    local list_file=$1
    shift
    local names

    # Collapse the list into a "|"-separated string for awk.
    names=$(tr -s '[:space:]' '|' < "$list_file") || return

    ${AwkCmd:-awk} -F "_" -v names="$names" '
        function emit() {
            if (entry != "" && (name in wanted))
                print entry
        }

        BEGIN {
            n = split(names, token, "|")
            for (i = 1; i <= n; i++)
                if (token[i] != "")
                    wanted[token[i]] = 1
        }

        /^>/ {
            emit()
            entry = $0
            name = $1
            sub(/^>/, "", name)
            next
        }

        {
            entry = entry "\n" $0
        }

        END {
            emit()
        }
    ' "$@"
}

# Print the FASTA entries whose identifier matches none of the listed
# patterns.
#
# The identifier is the first whitespace-delimited word of the header
# without its ">". The listed words are joined with "|" and used as an
# unanchored regular expression. An empty list removes nothing.
#
# Arguments: $1 - file with whitespace-separated patterns
#            $2... - FASTA path(s); stdin is read when none is given
# Outputs:   the retained entries on stdout.
function filter_out_sp_fasta_db() {
    local list_file=$1
    shift
    local pattern

    pattern=$(tr -s '[:space:]' '|' < "$list_file") || return
    # A leading or trailing "|" would add an empty alternative that
    # matches every identifier.
    pattern=${pattern#"|"}
    pattern=${pattern%"|"}

    ${AwkCmd:-awk} -v pattern="$pattern" '
        function emit() {
            # Nothing is printed before the first entry has been read, so
            # the output no longer starts with a blank line.
            if (entry != "" && (pattern == "" || name !~ pattern))
                print entry
        }

        /^>/ {
            emit()
            entry = $0
            name = $1
            sub(/^>/, "", name)
            next
        }

        {
            entry = entry "\n" $0
        }

        END {
            emit()
        }
    ' "$@"
}

# Distribute the entries of a FASTA file into one file per gene:
#     $SPGENESDIR/<gene>/<gene>.fst
#
# The gene is the text after the last "=" of the header, without the
# trailing ";". Entries are appended, so an existing gene file grows.
#
# Globals:   SPGENESDIR (read)
# Arguments: FASTA path(s); stdin is read when none is given.
# NOTE(review): a gene name containing "/" would create nested
#   directories -- gene names are assumed to be plain words.
function split_by_gene() {
    ${AwkCmd:-awk} -F "=" -v SPGENES="$SPGENESDIR" -v SQ="'" '
        # Wrap text in single quotes for the shell started by system().
        function shell_quote(text,    n, part, i, quoted) {
            n = split(text, part, SQ)
            quoted = SQ part[1]
            for (i = 2; i <= n; i++)
                quoted = quoted SQ "\\" SQ SQ part[i]
            return quoted SQ
        }

        function writegene(name, record,    outputdir, output) {
            outputdir = SPGENES "/" name
            output = outputdir "/" name ".fst"
            # Each directory is created once instead of once per entry.
            if (!(outputdir in made)) {
                system("mkdir -p -- " shell_quote(outputdir))
                made[outputdir] = 1
            }
            print record >> output
            close(output)
        }

        /^>/ && gene != "" {
            writegene(gene, entry)
        }

        /^>/ {
            entry = $0
            gene = $NF
            sub(/;$/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            # Empty input: there is no pending entry to flush.
            if (gene != "")
                writegene(gene, entry)
        }
    ' "$@"
}

# Cluster the protein sequences of one gene with cd-hit.
#
# Arguments: $1 - gene name; reads <gene>/<gene>.fst and writes
#                 <gene>/<gene>.cdhit.fst, relative to the current directory.
function dereplicate() {
    local gene="${1}"

    # Per the CD-HIT documentation -c is the identity threshold and -s the
    # length difference cutoff; both are set to 0.95.
    local identity=0.95
    local length_cutoff=0.95

    local source_fasta="${gene}/${gene}.fst"
    local clustered_fasta="${gene}/${gene}.cdhit.fst"

    local -a options=(
        -i "${source_fasta}"
        -o "${clustered_fasta}"
        -c "${identity}"
        -G 1
        -g 1
        -aL 0.95
        -s "${length_cutoff}"
        -b 350 -p 1
        -d 0 -n 3
    )

    cd-hit "${options[@]}"
}

# Run dereplicate on every gene directory found in $SPGENESDIR.
#
# Globals: SPGENESDIR (read)
# Returns: non-zero without doing anything when $SPGENESDIR cannot be
#          entered.
function dereplicate_genes() {
    local g

    # Directory-stack messages go to stderr, as in the build_* functions.
    pushd "$SPGENESDIR" 1>&2 || return
    for g in * ; do
        # Skips stray files and the literal "*" of an empty directory.
        [[ -d "$g" ]] || continue
        dereplicate "$g"
    done
    popd 1>&2
}

# Format the dereplicated sequences of one gene as a protein BLAST
# database.
#
# Arguments: $1 - gene name; <gene>/<gene>.cdhit.fst is read relative to
#                 the current directory.
# makeblastdb is run through timeoutcmd with the value 300 (presumably a
# time limit in seconds -- timeoutcmd is defined outside this file) and
# its output is discarded. A failure is reported in the log.
function buildGeneBlastDB() {
    local gene="${1}"
    local fastain="${gene}/${gene}.cdhit.fst"

    loginfo "  formatting Blast $gene DB"
    if ! timeoutcmd 300 makeblastdb -dbtype prot -in "${fastain}" > /dev/null 2>&1 ; then
        loginfo "  WARNING: makeblastdb did not complete successfully for $gene"
    fi
    loginfo "Done"
}

# Run buildGeneBlastDB on every gene directory found in $SPGENESDIR.
#
# Globals: SPGENESDIR (read)
# Returns: non-zero without doing anything when $SPGENESDIR cannot be
#          entered.
function buildBlastDBs() {
    local g

    # Directory-stack messages go to stderr, as in the build_* functions.
    pushd "$SPGENESDIR" 1>&2 || return
    for g in * ; do
        # Skips stray files and the literal "*" of an empty directory.
        [[ -d "$g" ]] || continue
        buildGeneBlastDB "$g"
    done
    popd 1>&2
}

# List the "shell" genes: entries of $SPDIR/genes whose name has no dot,
# matches none of the CORE_GENES (case-insensitive substring match), does
# not start with "orf" and does not match RPS12_GENE.
#
# Globals: SPDIR, CORE_GENES, RPS12_GENE (read)
# Outputs: one gene name per line on stdout -- and nothing else. Callers
#          capture this output, so no directory change (and therefore no
#          pushd/popd message) is performed here.
function list_shell_genes() {
    local core_pattern=${CORE_GENES// /|}

    ls -- "${SPDIR}/genes" \
    | grep -v '\.' \
    | grep -Eiv -- "$core_pattern" \
    | grep -iv '^orf' \
    | if [[ -n "${RPS12_GENE:-}" ]] ; then
          grep -iv -- "$RPS12_GENE"
      else
          # An empty pattern would match, and thus discard, every line.
          cat
      fi
}

# List the "dust" genes: entries of $SPDIR/genes whose name has no dot,
# matches none of the CORE_GENES (case-insensitive substring match) and
# starts with "orf".
#
# Globals: SPDIR, CORE_GENES (read)
# Outputs: one gene name per line on stdout.
function list_dust_genes() {
    local core_pattern=${CORE_GENES// /|}

    # grep -E replaces egrep, which current GNU grep flags as obsolescent.
    ls -- "${SPDIR}/genes" \
    | grep -v '\.' \
    | grep -Eiv -- "$core_pattern" \
    | grep -i '^orf'
}

# Rebuild $SPDIR/core from scratch with the dereplicated FASTA file and
# the .phr/.pin/.psq BLAST files of every gene listed in CORE_GENES.
#
# Globals: SPDIR, CORE_GENES (read)
# Returns: non-zero, before deleting anything, when $SPDIR cannot be
#          entered -- otherwise "rm -rf core" would hit the caller's cwd.
function build_core_libraries() {
    local gene suffix

    pushd "$SPDIR" 1>&2 || return

    rm -rf core
    mkdir -p core

    for gene in $CORE_GENES ; do
        for suffix in "" .phr .pin .psq ; do
            cp "genes/${gene}/${gene}.cdhit.fst${suffix}" "core/${gene}.fst${suffix}"
        done
    done

    popd 1>&2
}

# Rebuild $SPDIR/RPS12 from scratch with the dereplicated FASTA file and
# the .phr/.pin/.psq BLAST files of the gene named by RPS12_GENE.
#
# Globals: SPDIR, RPS12_GENE (read; RPS12_GENE is not set in this file)
# Returns: non-zero, before deleting anything, when $SPDIR cannot be
#          entered. When RPS12_GENE is empty the directory is still
#          recreated but nothing is copied.
function build_rps12_library() {
    local gene=${RPS12_GENE:-}
    local suffix

    pushd "$SPDIR" 1>&2 || return

    rm -rf RPS12
    mkdir -p RPS12

    if [[ -n "$gene" ]] ; then
        for suffix in "" .phr .pin .psq ; do
            cp "genes/${gene}/${gene}.cdhit.fst${suffix}" "RPS12/${gene}.fst${suffix}"
        done
    else
        echo "build_rps12_library: RPS12_GENE is empty, nothing copied" 1>&2
    fi

    popd 1>&2
}

# Rebuild $SPDIR/shell from scratch with the dereplicated FASTA file and
# the .phr/.pin/.psq BLAST files of every gene printed by
# list_shell_genes.
#
# Globals: SPDIR (read)
# Returns: non-zero, before deleting anything, when $SPDIR cannot be
#          entered -- otherwise "rm -rf shell" would hit the caller's cwd.
function build_shell_libraries() {
    local gene suffix

    pushd "$SPDIR" 1>&2 || return

    rm -rf shell
    mkdir -p shell

    for gene in $(list_shell_genes) ; do
        for suffix in "" .phr .pin .psq ; do
            cp "genes/${gene}/${gene}.cdhit.fst${suffix}" "shell/${gene}.fst${suffix}"
        done
    done

    popd 1>&2
}

# Rebuild $SPDIR/dust from scratch with the dereplicated FASTA file and
# the .phr/.pin/.psq BLAST files of every gene printed by
# list_dust_genes.
#
# Globals: SPDIR (read)
# Returns: non-zero, before deleting anything, when $SPDIR cannot be
#          entered -- otherwise "rm -rf dust" would hit the caller's cwd.
function build_dust_libraries() {
    local gene suffix

    pushd "$SPDIR" 1>&2 || return

    rm -rf dust
    mkdir -p dust

    for gene in $(list_dust_genes) ; do
        for suffix in "" .phr .pin .psq ; do
            cp "genes/${gene}/${gene}.cdhit.fst${suffix}" "dust/${gene}.fst${suffix}"
        done
    done

    popd 1>&2
}

# Print the annotation line of one gene:
#     <gene> <gene> -- -- -- <product>
#
# The SwissProt accession is read from the "SP_AC=" word of the first
# header of $SPGENESDIR/<gene>/<gene>.cdhit.fst. The DE lines of that
# entry in $SPDIR/rawdata/SP_Mito.dat then provide the product name: the
# longest "Short=" name when it exceeds 10 characters, otherwise the
# "RecName:" full name, followed by " (EC:<number>)" when an EC line is
# present. Blanks of the product are replaced by "_".
#
# Globals:   SPGENESDIR, SPDIR (read)
# Arguments: $1 - gene name
# Returns:   non-zero, printing nothing on stdout, when no accession can
#            be read (an empty accession used to make the extractor read
#            stdin instead of the flat file).
function get_product_line() {
    local gene=$1
    local fasta="${SPGENESDIR}/${gene}/${gene}.cdhit.fst"
    local spac

    spac=$(head -n 1 -- "$fasta" \
           | awk '
               { AC=$2
                 sub(/^SP_AC=/,"",AC)
                 sub(/;$/,"",AC)
                 print AC }')

    if [[ -z "$spac" ]] ; then
        echo "get_product_line: no SwissProt accession found for gene $gene" 1>&2
        return 1
    fi

    extract_mito_gene_ac "$spac" "${SPDIR}/rawdata/SP_Mito.dat" \
    | grep "^DE   " \
    | ${AwkCmd:-awk} -v gene="$gene" '
        # Value of a DE line: the text after the first "=", without the
        # closing ";" and without the optional {evidence} tag before it.
        function remove_tails(line) {
            sub(/ *(\{[^}]*\})?; *$/,"",line)
            st = index(line,"=")
            return substr(line,st+1)
        }

        /DE +RecName:/ {
            full = remove_tails($0)
        }
        /DE +Short=/ {
            ns = remove_tails($0)
            if (length(ns) > length(short)) {
                short = ns
            }
        }
        /DE +EC=/ {
            ec = remove_tails($0)
        }

        END {
            if (length(short) > 10) {
                product = short
            } else {
                product = full
            }

            if (ec != "") {
                product = product " (EC:" ec ")"
            }

            if (product == "") {
                product = "Hypothetical protein of unknown function"
            }

            gsub(/ /,"_",product)

            print gene,gene,"--","--","--",product
        }
      '
}

# Print one annotation line (see get_product_line) for every gene
# directory found in $SPGENESDIR.
#
# Globals: SPGENESDIR (read)
# Returns: non-zero without printing anything when $SPGENESDIR cannot be
#          entered.
function build_annotate_lst() {
    local gene

    pushd "$SPGENESDIR" 1>&2 || return

    for gene in * ; do
        # Skips stray files and the literal "*" of an empty directory.
        [[ -d "$gene" ]] || continue
        get_product_line "$gene"
    done

    popd 1>&2
}

# Everything below works with paths relative to $SPDIR and starts by
# deleting "rawdata" and "genes": stop at once if the directory cannot be
# created or entered, so that nothing is removed from another location.
mkdir -p -- "$SPDIR" || exit 1
pushd "$SPDIR" || exit 1

####
#
# Download and prepare raw library from Swissprot FTP site
#
###

rm -rf rawdata
mkdir -p rawdata

pushd rawdata || exit 1

download_swissprot | extract_mito_entries > SP_Mito.dat

extract_fasta_protein SP_Mito.dat > SP_Mito_gene_db.fst

popd

####
#
# Clean swiss-prot fasta file for gene name annotation
#
###

pushd rawdata || exit 1

rename_genes SP_Mito_gene_db.fst > SP_Mito_gene_db.clean_name.fst

clean_strange_gene_name SP_Mito_gene_db.clean_name.fst \
                      > SP_Mito_gene_db.good_gene.fst

popd

####
#
# Prepare the database for all genes
#
###

rm -rf genes
split_by_gene rawdata/SP_Mito_gene_db.good_gene.fst

dereplicate_genes

buildBlastDBs

####
#
# Prepare the different gene databases for CDS annotation
#
###

build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library

####
#
# Build Annotation file
#
###

build_annotate_lst > Annot.lst

# ls ../../mitodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst  > join_core.lst

popd
@ -0,0 +1,23 @@
#!/bin/bash

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Print the path of every GenBank file (*.gbk*, optionally gzipped) of
# directory $1 whose ORGANISM line, or the line following it, mentions
# Metazoa.
#
# grep -H prefixes each reported line with the file name followed by ":"
# (matching line) or "-" (context line); the final sed removes that single
# trailing character to recover the path.
for f in "$1"/*.gbk* ; do
    # An unmatched glob stays literal: skip it instead of grepping it.
    [[ -e "$f" ]] || continue
    if [[ "$f" =~ \.gz$ ]] ; then
        zgrep -H -A 1 '  ORGANISM' "$f"
    else
        grep -H -A 1 '  ORGANISM' "$f"
    fi
done \
| grep -B 1 Metazoa \
| $AwkCmd '{print $1}' \
| grep '\.gbk' \
| sed -E 's/(^.*\.gbk(\.gz)?).$/\1/' \
| uniq
@ -0,0 +1,141 @@
#!/bin/bash
#
#                           BUILD RRNA models
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
# The inner expansion is quoted so that an installation path containing
# spaces is not word-split by dirname.
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Rewrite a FASTA stream so that every sequence sits on a single line.
#
# Arguments: $1 - optional FASTA file; stdin is read when it is absent.
# Outputs:   headers unchanged, each followed by its joined sequence. The
#            last line is always the pending sequence, even when empty.
function fasta1li {

    # $1 stays unquoted on purpose: when the function is used inside a
    # pipeline the empty argument has to disappear so that awk reads stdin.
    $AwkCmd '
        /^>/ {
            if (joined)
                print joined
            print
            joined = ""
            next
        }
        { joined = joined $0 }
        END { print joined }
    ' $1
}

# Keep one representative per group of identical sequences.
#
# sumaclust is run with threshold 1 and only the records flagged
# "cluster_center=True;" are kept. In their headers the original
# "count=N; " attribute is dropped and "cluster_weight" is renamed to
# "count"; the identifier receives a running "_<n>" suffix and the header
# is reduced to "<id>_<n> count=<weight>;".
#
# Arguments: $1 - FASTA file
# Outputs:   the representative sequences on stdout, one line each.
function dereplicate {
	local data=$1

	sumaclust -t 1 "$data" | \
		fasta1li | \
		grep -A 1 '^>' | \
		grep -A1 'cluster_center=True;' | \
		grep -v '^--$' | \
		sed -E -e 's/count=[0-9]+; //' -e 's/cluster_weight/count/' | \
		$AwkCmd '
			/^>/ {
				SEQ++
				match($0, "count=[0-9][0-9]*;")
				count = substr($0, RSTART, RLENGTH)
				$1 = $1 "_" SEQ
				print $1, count
				next
			}
			{ print $0 }'
}


# Cluster <name>.fasta with sumaclust (threshold 0.9) and align every
# cluster with muscle.
#
# Arguments: $1 - dataset name; <name>.fasta is read, <name>.clust.fasta
#                 is written, and the directory <name> is recreated with
#                 one "<weight on 5 digits>_<cluster>.fasta" file and its
#                 ".align.fasta" counterpart per cluster.
function clustering {
	local data=$1
	local clustered="${data}.clust.fasta"
	local clusters c w out

	if [[ -z "$data" ]] ; then
		echo "clustering: missing dataset name" 1>&2
		return 1
	fi

	rm -rf -- "$data"
	mkdir -- "$data"

	sumaclust -t 0.9 "${data}.fasta" | \
		fasta1li > "$clustered"

	clusters=$(grep '^>' "$clustered" | \
	            sed -E 's/.*cluster=([^;]+);.*$/\1/' | \
	            sort -u)

	for c in $clusters; do
		# Members are selected on the complete "cluster=<id>;" attribute:
		# matching the bare identifier would also pick up identifiers that
		# merely contain it (s1 versus s10).
		w=$(grep -F -- "cluster=${c};" "$clustered" | \
			head -1 | \
			sed -E 's/.*cluster_weight=([^;]+);.*$/\1/')
		out=$(printf '%s/%05d_%s' "$data" "$w" "$c")
		grep -F -A 1 -- "cluster=${c};" "$clustered" | \
			grep -v '^--$' > "${out}.fasta"
		muscle -in "${out}.fasta" -out "${out}.align.fasta"
	done
}

# Reverse-complement every sequence of a FASTA stream.
#
# Only a, c, g, t (both cases) are complemented; every other character is
# kept as it is. Sequences are printed 60 characters per line. The work is
# done inside awk, without starting a shell for each sequence. Empty input
# produces no output.
#
# Arguments: optional FASTA path(s); stdin is read when none is given.
function revcomp {
    ${AwkCmd:-awk} '
        function printfasta(seq,    i, seqlen) {
            seqlen = length(seq)
            for (i = 1; i <= seqlen; i += 60)
                print substr(seq, i, 60)
        }

        function revcomp(seq,    i, base, out) {
            out = ""
            for (i = length(seq); i > 0; i--) {
                base = substr(seq, i, 1)
                out = out ((base in COMPLEMENT) ? COMPLEMENT[base] : base)
            }
            return out
        }

        function flush_record() {
            if (head == "" && seq == "")
                return
            print head
            printfasta(revcomp(seq))
            seq = ""
        }

        BEGIN {
            n = split("a c g t A C G T", from, " ")
            split("t g c a T G C A", to, " ")
            for (i = 1; i <= n; i++)
                COMPLEMENT[from[i]] = to[i]
        }

        /^>/ {
            # A header without any sequence is replaced by the next header.
            if (seq != "")
                flush_record()
            head = $0
            next
        }

        { seq = seq $0 }

        END { flush_record() }
    ' "$@"
}


# Build the mitochondrial 12S and 16S rRNA reference sets from the GenBank
# files given on the command line. Everything runs in the working
# directory opened by pushTmpDir and closed by popTmpDir (both defined
# outside this file); results are copied under ${RRNA_DATA_DIR}.
pushTmpDir ORG.buildRRNAMito
	loginfo "Tempdir: $(pwd)"

	openLogFile "${RRNA_DATA_DIR}/rRNA_mito_models.log"
	
	loginfo "Selecting Metazoa genebank entries..."
		# "$@" keeps every command-line argument intact.
		METAZOA=$(${PROG_DIR}/../../normalize/tools/selectMetazoa.sh "$@") 
		loginfo " --> $(echo ${METAZOA} | wc -w) entries selected"
	loginfo "Done"
	
	loginfo "Extracting 12S rRNA sequences..."
		rm -f raw_12S.fasta
		for f in ${METAZOA}; do
			loginfo "Extracting 12S rRNA sequences from ${f}..."
			${PROG_DIR}/extract_ref12S.sh "${f}" >> raw_12S.fasta
		done
		loginfo " --> $(fastaCount raw_12S.fasta) retreived sequences"
		# Overwrite, as for 16S: appending would pile up on any
		# 12S.fasta already present.
		dereplicate raw_12S.fasta > 12S.fasta
		loginfo " --> $(fastaCount 12S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 12S rRNA sequences..."
		clustering 12S
	loginfo "Done"
	
	loginfo "Installing 12S rRNA sequences..."
		cp -r 12S 	"${RRNA_DATA_DIR}/RRNA_12S_mito"
	loginfo "Done"


	loginfo "Extracting 16S rRNA sequences..."
		rm -f raw_16S.fasta
		for f in ${METAZOA}; do
			${PROG_DIR}/extract_ref16S.sh "${f}" >> raw_16S.fasta
		done
		loginfo " --> $(fastaCount raw_16S.fasta) retreived sequences"
		dereplicate raw_16S.fasta > 16S.fasta
		loginfo " --> $(fastaCount 16S.fasta) distinct sequences"
	loginfo "Done"

	loginfo "Clustering 16S rRNA sequences..."
		clustering 16S
	loginfo "Done"
	
	loginfo "Installing 16S rRNA sequences..."
		cp -r 16S 	"${RRNA_DATA_DIR}/RRNA_16S_mito"
	loginfo "Done"


popTmpDir
@ -0,0 +1,55 @@
#!/bin/bash
#


# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

# Extract the 12S rRNA sequence of every record of the GenBank flat
# file(s) given as arguments (stdin when none is given) as FASTA:
#     >RRNA12S_<accession> Strand=<0|1>; cut=<from>..<to>; seq_length=<n>;
#
# The location is taken from the most recent "rRNA" feature line seen when
# a line containing "12S" is read. Strand=0 marks a complement() location,
# in which case the extracted sequence is reverse-complemented. Characters
# other than a, c, g, t are replaced by "n". Reverse complementation is
# done inside awk, without starting a shell for each sequence.
#
# NOTE(review): any line containing "12S" triggers the cut, not only the
#   qualifiers of an rRNA feature -- confirm against real GenBank records.
${AwkCmd:-awk} '
    function printfasta(seq,    i, seqlen) {
        seqlen = length(seq)
        for (i = 1; i <= seqlen; i += 60)
            print substr(seq, i, 60)
    }

    function revcomp(seq,    i, base, out) {
        out = ""
        for (i = length(seq); i > 0; i--) {
            base = substr(seq, i, 1)
            out = out ((base in COMPLEMENT) ? COMPLEMENT[base] : base)
        }
        return out
    }

    BEGIN {
        n = split("a c g t A C G T", from, " ")
        split("t g c a T G C A", to, " ")
        for (k = 1; k <= n; k++)
            COMPLEMENT[from[k]] = to[k]
    }

    # New record: forget everything about the previous one, including the
    # last rRNA location.
    /^LOCUS / {
        AC = $2
        sequence = ""
        seqon = 0
        FROM = ""
        TO = ""
        LOCUS = ""
    }

    /^     rRNA  / {
        LOCUS = $2
        STRAND = 1
    }

    /^     rRNA  / && /complement/ {
        STRAND = 0
        sub("complement\\(", "", LOCUS)
        sub("\\)", "", LOCUS)
    }

    # "from..to" split on "." gives from, an empty string and to.
    /12S/ {
        split(LOCUS, POS, ".")
        FROM = POS[1]
        TO = POS[3]
        LENGTH = TO - FROM + 1
    }

    /^ORIGIN/ { seqon = 1 }

    /^ *[1-9][0-9]* [a-z ]+$/ && seqon {
        seq = $2 $3 $4 $5 $6 $7
        gsub("[^acgt]", "n", seq)
        sequence = sequence seq
    }

    /^\/\// && FROM {
        print ">RRNA12S_" AC " Strand=" STRAND ";", "cut=" FROM ".." TO ";", "seq_length=" LENGTH ";"
        SS = substr(sequence, FROM, LENGTH)
        if (! STRAND)
            SS = revcomp(SS)
        printfasta(SS)
    }
' "$@"
@ -2,7 +2,12 @@
#


  gawk 'function printfasta(seq) {                                            \
# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                    \

@ -2,7 +2,12 @@
#


  gawk 'function printfasta(seq) {                                            \
# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                     \

@ -1,8 +1,12 @@
#!/bin/bash
#

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  gawk 'function printfasta(seq) {                                            \
  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                     \

@ -2,7 +2,12 @@
#


  gawk 'function printfasta(seq) {                                            \
# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                     \

@ -11,7 +11,7 @@ source "${THIS_DIR}/../../../scripts/bash_init.sh"


function revcomp {
    gawk 'function printfasta(seq) {                                  \
    $AwkCmd 'function printfasta(seq) {                                  \
            seqlen=length(seq);                                       \
            for (i=1; i <= seqlen; i+=60)                              \
              print substr(seq,i,60);                                 \

@ -0,0 +1,29 @@
#!/bin/bash
#
#                           splitgbk.sh:
#                           Split a gbk file in multiple files
#                           each containing a single sequence
#
#========================================================================================

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
THIS_DIR="$(dirname -- "${BASH_SOURCE[0]}")"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

inputfile=$1

if [[ -z "$inputfile" ]] ; then
    echo "Usage: ${0##*/} <file.gbk>" 1>&2
    exit 1
fi

# Destination directory: the input path with everything from the first dot
# of its file name removed. Only the file name is trimmed, so a dot in the
# directory part ("./x.gbk", "../data/x.gbk") no longer empties or
# truncates the destination.
input_dir=$(dirname -- "$inputfile")
input_name=${inputfile##*/}
if [[ "$input_dir" == "." && "$inputfile" != ./* ]] ; then
    dest=${input_name%%.*}
else
    dest="${input_dir}/${input_name%%.*}"
fi

mkdir -p -- "$dest"

# Every record (LOCUS line up to the "//" terminator) is appended to
# <dest>/<accession>.gbk. Lines found before the first LOCUS line belong
# to no record and are skipped.
$AwkCmd -v dest="$dest" '/^LOCUS/ {
            AC=$2;
            destfile = sprintf("%s/%s.gbk", dest, AC);
        }
        destfile != "" {
            print $0 >> destfile
        }
        /^\/\// && destfile != "" {
            close(destfile);
        }
        ' "$inputfile"
@ -32,7 +32,7 @@ pushTmpDir ORG.trna

	TRNA=$(basename ${QUERY})
	
	aragorn -i -w -seq -gc11 ${QUERY} | \
	aragorn -i -w -seq -gc${2} ${QUERY} | \
		${AwkCmd} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk
	

@ -0,0 +1,390 @@
'\" t
.\"     Title: aragorn
.\"    Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\"      Date: 02/24/2013
.\"    Manual: \ \&
.\"    Source: \ \&
.\"  Language: English
.\"
.TH "ARAGORN" "1" "02/24/2013" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.\" http://bugs.debian.org/507673
.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\" -----------------------------------------------------------------
.\" * set default formatting
.\" -----------------------------------------------------------------
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
.ad l
.\" -----------------------------------------------------------------
.\" * MAIN CONTENT STARTS HERE *
.\" -----------------------------------------------------------------
.SH "NAME"
aragorn \- detect tRNA genes in nucleotide sequences
.SH "SYNOPSIS"
.sp
\fBaragorn\fR [\fIOPTION\fR]\&... \fIFILE\fR
.SH "OPTIONS"
.PP
\fB\-m\fR
.RS 4
Search for tmRNA genes\&.
.RE
.PP
\fB\-t\fR
.RS 4
Search for tRNA genes\&. By default, all are detected\&. If one of
\fB\-m\fR
or
\fB\-t\fR
is specified, then the other is not detected unless specified as well\&.
.RE
.PP
\fB\-mt\fR
.RS 4
Search for Metazoan mitochondrial tRNA genes\&. tRNA genes with introns not detected\&.
\fB\-i\fR,
\fB\-sr\fR
switches ignored\&. Composite Metazoan mitochondrial genetic code used\&.
.RE
.PP
\fB\-mtmam\fR
.RS 4
Search for Mammalian mitochondrial tRNA genes\&.
\fB\-i\fR,
\fB\-sr\fR
switches ignored\&.
\fB\-tv\fR
switch set\&. Mammalian mitochondrial genetic code used\&.
.RE
.PP
\fB\-mtx\fR
.RS 4
Same as
\fB\-mt\fR
but low scoring tRNA genes are not reported\&.
.RE
.PP
\fB\-mtd\fR
.RS 4
Overlapping metazoan mitochondrial tRNA genes on opposite strands are reported\&.
.RE
.PP
\fB\-gc\fR[\fInum\fR]
.RS 4
Use the GenBank transl_table = [\fInum\fR] genetic code\&. Individual modifications can be appended using
\fI,BBB\fR=<aa> B = A,C,G, or T\&. <aa> is the three letter code for an amino\-acid\&. More than one modification can be specified\&. eg
\fB\-gcvert\fR,aga=Trp,agg=Trp uses the Vertebrate Mitochondrial code and the codons AGA and AGG changed to Tryptophan\&.
.RE
.PP
\fB\-gcstd\fR
.RS 4
Use standard genetic code\&.
.RE
.PP
\fB\-gcmet\fR
.RS 4
Use composite Metazoan mitochondrial genetic code\&.
.RE
.PP
\fB\-gcvert\fR
.RS 4
Use Vertebrate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcinvert\fR
.RS 4
Use Invertebrate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcyeast\fR
.RS 4
Use Yeast mitochondrial genetic code\&.
.RE
.PP
\fB\-gcprot\fR
.RS 4
Use Mold/Protozoan/Coelenterate mitochondrial genetic code\&.
.RE
.PP
\fB\-gcciliate\fR
.RS 4
Use Ciliate genetic code\&.
.RE
.PP
\fB\-gcflatworm\fR
.RS 4
Use Echinoderm/Flatworm mitochondrial genetic code\&.
.RE
.PP
\fB\-gceuplot\fR
.RS 4
Use Euplotid genetic code\&.
.RE
.PP
\fB\-gcbact\fR
.RS 4
Use Bacterial/Plant Chloroplast genetic code\&.
.RE
.PP
\fB\-gcaltyeast\fR
.RS 4
Use alternative Yeast genetic code\&.
.RE
.PP
\fB\-gcascid\fR
.RS 4
Use Ascidian Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcaltflat\fR
.RS 4
Use alternative Flatworm Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcblep\fR
.RS 4
Use Blepharisma genetic code\&.
.RE
.PP
\fB\-gcchloroph\fR
.RS 4
Use Chlorophycean Mitochondrial genetic code\&.
.RE
.PP
\fB\-gctrem\fR
.RS 4
Use Trematode Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcscen\fR
.RS 4
Use Scenedesmus obliquus Mitochondrial genetic code\&.
.RE
.PP
\fB\-gcthraust\fR
.RS 4
Use Thraustochytrium Mitochondrial genetic code\&.
.RE
.PP
\fB\-tv\fR
.RS 4
Do not search for mitochondrial TV replacement loop tRNA genes\&. Only relevant if
\fB\-mt\fR
used\&.
.RE
.PP
\fB\-c7\fR
.RS 4
Search for tRNA genes with 7 base C\-loops only\&.
.RE
.PP
\fB\-i\fR
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length 3000 bases\&. Minimum intron length is 0 bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-i\fR[\fImax\fR]
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length [\fImax\fR] bases\&. Minimum intron length is 0 bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-i\fR[\fImin\fR],[\fImax\fR]
.RS 4
Search for tRNA genes with introns in anticodon loop with maximum length [\fImax\fR] bases, and minimum length [\fImin\fR] bases\&. Ignored if
\fB\-m\fR
is specified\&.
.RE
.PP
\fB\-io\fR
.RS 4
Same as
\fB\-i\fR, but allow tRNA genes with long introns to overlap shorter tRNA genes\&.
.RE
.PP
\fB\-if\fR
.RS 4
Same as
\fB\-i\fR, but fix intron between positions 37 and 38 on C\-loop (one base after anticodon)\&.
.RE
.PP
\fB\-ifo\fR
.RS 4
Same as
\fB\-if\fR
and
\fB\-io\fR
combined\&.
.RE
.PP
\fB\-ir\fR
.RS 4
Same as
\fB\-i\fR, but report tRNA genes with minimum intron length [\fImin\fR] bases rather than search for tRNA genes with minimum intron length [\fImin\fR] bases\&. With this switch, [\fImin\fR] acts as an output filter, minimum intron length for searching is still 0 bases\&.
.RE
.PP
\fB\-c\fR
.RS 4
Assume that each sequence has a circular topology\&. Search wraps around each end\&. Default setting\&.
.RE
.PP
\fB\-l\fR
.RS 4
Assume that each sequence has a linear topology\&. Search does not wrap\&.
.RE
.PP
\fB\-d\fR
.RS 4
Double\&. Search both strands of each sequence\&. Default setting\&.
.RE
.PP
\fB\-s\fR or \fB\-s+\fR
.RS 4
Single\&. Do not search the complementary (antisense) strand of each sequence\&.
.RE
.PP
\fB\-sc\fR or \fB\-s\-\fR
.RS 4
Single complementary\&. Do not search the sense strand of each sequence\&.
.RE
.PP
\fB\-ps\fR
.RS 4
Lower scoring thresholds to 95% of default levels\&.
.RE
.PP
\fB\-ps\fR[\fInum\fR]
.RS 4
Change scoring thresholds to [\fInum\fR] percent of default levels\&.
.RE
.PP
\fB\-rp\fR
.RS 4
Flag possible pseudogenes (score < 100 or tRNA anticodon loop <> 7 bases long)\&. Note that genes with score < 100 will not be detected or flagged if scoring thresholds are not also changed to below 100% (see \-ps switch)\&.
.RE
.PP
\fB\-seq\fR
.RS 4
Print out primary sequence\&.
.RE
.PP
\fB\-br\fR
.RS 4
Show secondary structure of tRNA gene primary sequence using round brackets\&.
.RE
.PP
\fB\-fasta\fR
.RS 4
Print out primary sequence in fasta format\&.
.RE
.PP
\fB\-fo\fR
.RS 4
Print out primary sequence in fasta format only (no secondary structure)\&.
.RE
.PP
\fB\-fon\fR
.RS 4
Same as
\fB\-fo\fR, with sequence and gene numbering in header\&.
.RE
.PP
\fB\-fos\fR
.RS 4
Same as
\fB\-fo\fR, with no spaces in header\&.
.RE
.PP
\fB\-fons\fR
.RS 4
Same as
\fB\-fo\fR, with sequence and gene numbering, but no spaces\&.
.RE
.PP
\fB\-w\fR
.RS 4
Print out in Batch mode\&.
.RE
.PP
\fB\-ss\fR
.RS 4
Use the stricter canonical 1\-2 bp spacer1 and 1 bp spacer2\&. Ignored if
\fB\-mt\fR
set\&. Default is to allow 3 bp spacer1 and 0\-2 bp spacer2, which may degrade selectivity\&.
.RE
.PP
\fB\-v\fR
.RS 4
Verbose\&. Prints out information during search to STDERR\&.
.RE
.PP
\fB\-a\fR
.RS 4
Print out tRNA domain for tmRNA genes\&.
.RE
.PP
\fB\-a7\fR
.RS 4
Restrict tRNA astem length to a maximum of 7 bases\&.
.RE
.PP
\fB\-aa\fR
.RS 4
Display message if predicted iso\-acceptor species does not match species in sequence name (if present)\&.
.RE
.PP
\fB\-j\fR
.RS 4
Display 4\-base sequence on 3\*(Aq end of astem regardless of predicted amino\-acyl acceptor length\&.
.RE
.PP
\fB\-jr\fR
.RS 4
Allow some divergence of 3\*(Aq amino\-acyl acceptor sequence from NCCA\&.
.RE
.PP
\fB\-jr4\fR
.RS 4
Allow some divergence of 3\*(Aq amino\-acyl acceptor sequence from NCCA, and display 4 bases\&.
.RE
.PP
\fB\-q\fR
.RS 4
Do not print configuration line (which switches and files were used)\&.
.RE
.PP
\fB\-rn\fR
.RS 4
Repeat sequence name before summary information\&.
.RE
.PP
\fB\-O\fR [\fIoutfile\fR]
.RS 4
Print output to
[\fIoutfile\fR]\&. If [\fIoutfile\fR] already exists, it is overwritten\&. By default all output goes to stdout\&.
.RE
.SH "DESCRIPTION"
.sp
aragorn detects tRNA, mtRNA, and tmRNA genes\&. A minimum requirement is at least a 32 bit compiler architecture (variable types int and unsigned int are at least 4 bytes long)\&.
.sp
[\fIFILE\fR] is assumed to contain one or more sequences in FASTA format\&. Results of the search are printed to STDOUT\&. All switches are optional and case\-insensitive\&. Unless \-i is specified, tRNA genes containing introns are not detected\&.
.SH "AUTHORS"
.sp
Bjorn Canback <bcanback@acgt\&.se>, Dean Laslett <gaiaquark@gmail\&.com>
.SH "REFERENCES"
.sp
Laslett, D\&. and Canback, B\&. (2004) ARAGORN, a program for the detection of transfer RNA and transfer\-messenger RNA genes in nucleotide sequences Nucleic Acids Research, 32;11\-16
.sp
Laslett, D\&. and Canback, B\&. (2008) ARWEN: a program to detect tRNA genes in metazoan mitochondrial nucleotide sequences Bioinformatics, 24(2); 172\-175\&.