831669433e1b156f9854e51bdd96a53f2fe04787

@ -6,10 +6,13 @@
#
#  Annotate CDS using exonerate
#
#  do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
#  do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
#
#   - <PASS>       : the pass running exonarate
#		- <FASTAGENOM> : The fasta file containing the genome to annotate
#		- <FASTAPROT>  : The fasta file containing the protein family
#   - <ANNOTFILE>  : The annotation file used to add product info
#	  - <MODELDIR>   : Directory containing model parameters for exonerate
#
#  Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res 
#
@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'

NeedArg 2

set Pass = $Argv[1]; Shift
set GenoFile = $Argv[1]; Shift
set GenoName = `basename $GenoFile:r`

set ProtFile = $Argv[1]; Shift
set ProtDir  = `dirname $ProtFile`
set DBDir    = `dirname $ProtDir`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`

set AnnotFile = $Argv[1]; Shift

NeedFile $GenoFile
NeedFile $ProtFile
NeedFile $ProtDir/Annot.lst
NeedFile $AnnotFile

set ModelsDir = $PROG_DIR/../models
if ($#Argv > 0) then
@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
		$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN         \
		        -v ALLOW_STOP=1     \
		        -v EXCLUDE=$GenoName                \
		        -f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best		
		        -f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best	
	endif
endif

@ -196,7 +203,16 @@ endif
# get annotations
#

egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`

if ( ${%sp_match} > 0 ) then
  set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
   echo "             " patch ac $sp_match to $sp_ac > /dev/stderr
  sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
  mv T_$$ $base.exo.best
endif

egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$

#
# extend start/stop
@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk             \
# translate
#

echo "c pass pass1 $ProtType" > $base.iff
echo "c pass $Pass $ProtType" > $base.iff

$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
        -f $LIB_DIR/translate.awk T_$$ >> $base.iff

@ -37,10 +37,13 @@ else
    TEMP=""
fi

DBROOT="$CDS_DATA_DIR/chlorodb/RPS12"
RPS12DB="${DBROOT}/RPS12_DB.clean.fst"
DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12"
RPS12DB="${DBROOT}/rps12.fst"
DELTA=50

AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst"
ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models"

SEQLEN=$(seqlength "${QUERY}")
SEQUENCE=$(readfirstfastaseq "${QUERY}") 

@ -61,7 +64,9 @@ blastx \
                BEGIN {BEST_EVAL = 1e-40; 
                       OUT = 0} 
                /^#/ {next} 
                ($2 == PREV_CDS) { HSPs = HSPs "\n" $0;} 
                ($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) { 
                    HSPs = HSPs "\n" $0;
                    } 
                
                (OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) { 
                    if (PREV_CDS) print HSPs; 
@ -75,6 +80,7 @@ blastx \
                (BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)} 
          ' > "rps12_locate.hsps"

    
    #
    # Extracting protein ids from selected blast HSPs
    #
@ -83,7 +89,6 @@ blastx \
        | sort \
        | uniq > "dbsel.txt"


    #
    # Extract corresponding protein sequences
    # from the RPS12 database.
@ -134,7 +139,7 @@ blastx \
                        }
                    }
        ' \
        | sort -nk 3 \
        | sort -nk 3  \
        | $AwkCmd '($3 != old3 || $4 != old4) { 
                          i++
                          old3=$3 
@ -262,13 +267,14 @@ blastx \
    # It should be one or two fragments
    #
        export PASS1_SPEEDUP=0
        cp $DBROOT/Annot.lst RPS12
        nbseq = 0
        nbseq=0
        for fasta in rps12_fragments_*.fasta ; do
            tcsh -f ${PROG_DIR}/do_exonerate.csh \
                Pass2 \
                $fasta \
                "RPS12/rps12.fasta" \
                $DBROOT/../models $(pwd)
                $AnnotFile \
                $ModelsDir $(pwd)
            ((nbseq=nbseq+1))
        done


@ -35,19 +35,22 @@ Genome=$(basename ${Fasta%.*})
# DbRoot is set to its default values except 
# if the second argument precise another DbRoot

DbRoot="$CDS_DATA_DIR/chlorodb"
DbRoot="$CDS_DATA_DIR/sp_chlorodb"

if (( $# > 0)) ; then
  DbRoot="$1"; Shift
fi

AnnotFile="$DbRoot/Annot.lst" 

needdir $DbRoot
needdir $DbRoot/core
needfile $DbRoot/core/Annot.lst
needfile $AnnotFile
needdir $DbRoot/models

assignundef cdsdetection_pass1 yes
assignundef cdsdetection_pass2 yes
assignundef cdsdetection_pass3 yes

temp=$(mktempdir $(hostname))

@ -63,33 +66,47 @@ Fasta="$temp/genome.fasta"
#

if [[ "$cdsdetection_pass1" == "yes" ]] ; then
    for dir in "core" "shell" "dust" ; do
    for dir in "core" ; do
    if [[ -d $DbRoot/$dir ]] ; then
        fams=$(ls $DbRoot/$dir/*.clean.fst)
        fams=$(ls $DbRoot/$dir/*.fst)
        loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
        for f in $fams ; do
        tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
        tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
        done
    fi
    done

    cp $temp/genome.cds.fasta $Genome.cds.fasta 

    mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta 
fi


#
# pass2: transsplicing
# pass2: RPS12 gene with transsplicing 
#

if [[ "$cdsdetection_pass2" == "yes" ]] ; then
    loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
    $PROG_DIR/do_rps12.sh $Fasta $temp
fi

#
# pass3: prokov
# pass3: run exonerate on shell and dust
#

if [[ "$cdsdetection_pass3" == "yes" ]] ; then
    for dir in "shell" ; do
    if [[ -d $DbRoot/$dir ]] ; then
        fams=$(ls $DbRoot/$dir/*.fst)
        loginfo $fams
        loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
        for f in $fams ; do
          tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
        done
    fi
    done
    mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta 
fi

# $PROG_DIR/do_prokov.sh $Fasta $Genome.cds.fasta $temp

#

@ -142,8 +142,7 @@ function Unk(s) {
}

/^e similarity/ {
  split($12, a, "@")
  Simil = a[1] ":" a[2]
  Simil = a[1] "UniProtKB/Swiss-Prot:" $12
  next
}

@ -179,7 +178,7 @@ function Unk(s) {
			QQualifier("note","nonfunctional due to a frameshift")
		}
	QQualifier("product", Product)
	QQualifier("inference", "similar to DNA sequence:" Simil)
	QQualifier("inference", "similar to " Simil)
	# QQualifier("inference", "org.annot -- detect pass:" PassType ":" PassInfo)
	if (FrameShift==0) {
		if (match(Translat,/\*/)>0) {

@ -0,0 +1,542 @@
#!/bin/bash
#
#                           BUILD REFERENCE : From SwissProt
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink

THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

SPDIR="$CDS_DATA_DIR/sp_chlorodb"
SPGENESDIR="$SPDIR/genes"

CORE_GENES="accD"
CORE_GENES="$CORE_GENES atpA atpB atpE atpF atpH atpI"
CORE_GENES="$CORE_GENES ccsA"
CORE_GENES="$CORE_GENES cemA"
CORE_GENES="$CORE_GENES clpP"
CORE_GENES="$CORE_GENES infA"
CORE_GENES="$CORE_GENES matK"
CORE_GENES="$CORE_GENES ndhA ndhB ndhC ndhD ndhE ndhF ndhG ndhH ndhI ndhJ ndhK"
CORE_GENES="$CORE_GENES petA petB petD petG petL petN"
CORE_GENES="$CORE_GENES psaA psaB psaC psaI psaJ psbA"
CORE_GENES="$CORE_GENES psbB psbC psbD psbE psbF psbH psbI psbJ psbK psbL psbM psbN psbT psbZ"
CORE_GENES="$CORE_GENES rbcL"
CORE_GENES="$CORE_GENES rpl14 rpl16 rpl2 rpl20 rpl22 rpl23 rpl32 rpl33 rpl36"
CORE_GENES="$CORE_GENES rpoA rpoB rpoC1 rpoC2"
CORE_GENES="$CORE_GENES rps2 rps3 rps4 rps7 rps8 rps11 rps14 rps15 rps16 rps18 rps19"
CORE_GENES="$CORE_GENES ycf1 ycf2 ycf3 ycf4"

RPS12_GENE="rps12"

function download_swissprot() {
    URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"

    curl $URL
}

function extract_chloro_entries() {
    awk -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        /OC   Eukaryota; Viridiplantae; Streptophyta;/ && /OG   Plastid; Chloroplast./ && !/DE   Flags: Fragment;/ {print $0}
    ' $1
}

function extract_chloro_gene_entries() {

    awk -v gene=$1 -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        ($0 ~ gene"_") && /OC   Eukaryota; Viridiplantae; Streptophyta;/ && /OG   Plastid; Chloroplast./ {print $0}
    ' $2
}

function extract_chloro_gene_frg() {

    awk -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        /DE   Flags: Fragment;/ {print $0}
    ' $1
}

function extract_chloro_gene_ac() {

    awk -v ac=$1 -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        ($0 ~ "AC   " ac ";") && /OC   Eukaryota; Viridiplantae; Streptophyta;/ && /OG   Plastid; Chloroplast./ {print $0}
    ' $2
}

function extract_fasta_protein() {
    $AwkCmd '
        /^ID/ {ID=$2} 
        /^AC/ {AC=$2; sub(";","",AC)} 
        /^DR   EMBL;/ {
            EMBL=$4; 
            sub(";","",EMBL); 
            sub("-","xxx",EMBL);
            if ( EMBL != "xxx" ) {
                EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
                EBI | getline GENE
                close(EBI)
            } else {
                GENE = "xxx"
            }
            } 
        /^   / {gsub(/ /,"",$0); SEQ=SEQ $0}
        /^\/\// {
            if (GENE != "xxx") {
                print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
                print SEQ 
            }
            ID="xxx";
            AC="xxx"; 
            EMBL="xxx"; 
            GENE="xxx"
            SEQ=""
        }
        ' $1 \
    | formatfasta
}

function rename_rules() {
    egrep "^>" $1 \
    | sed 's/^>//' \
    | sed 's/_/=/' \
    | tr -d ";" \
    | awk -F"=" '{print $1,$NF}' \
    | sort \
    | uniq -c \
    | awk '{
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -rn \
    | sed 's/ /=/' \
    | sed 's/ /=/' \
    | awk -F "=" '
        ($1 == current) {
            sub(/^0+/,"",occurrence)
            print "# based on",occurrence,"observations renames",$NF,"in",gene
            print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
        }
        ($1 != current) {
            current= $1
            gene=$NF 
            occurrence=$2
        }
        '
}

function rename_genes() {
    local n=1
    local input=$1
    cat $input > __tmp__$$__fasta
    rules=${input/fst/rules}
    #echo $rule 1>&2

    rename_rules __tmp__$$__fasta > $rules.$n
    while [[ -s $rules.$n ]] ; do
        sed -f $rules.$n __tmp__$$__fasta > __tmp2__$$__fasta
        mv __tmp2__$$__fasta __tmp__$$__fasta
        ((n++))
        rename_rules __tmp__$$__fasta > $rules.$n
    done

    cat __tmp__$$__fasta
    rm __tmp__$$__fasta
}


function clean_strange_gene_name() {
    local input=$1
    local output=${input/.fst/_sp_ebi.genes}
    local to_keep=${input/.fst/_to_keep.lst}
    local to_be_removed=${input/.fst/_to_be_removed.lst}

    grep '^>' $input \
    | sed 's/^>//' \
    | sed 's/_/=/' \
    | sed 's/;$//' \
    | awk -F'=' '{print $NF,$1}' \
    | sort \
    | uniq -c \
    | $AwkCmd '{
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -r > $output

    $AwkCmd '
        ($1!=current) {current=$1; print $3}
        ' $output > $to_keep

    $AwkCmd '
        ($1==current) {print $3} 
        ($1!=current) {current=$1}
        ' $output > $to_be_removed

    filter_sp_fasta_db $to_be_removed $input > ${input/.fst/_strange_genes.fst}
    filter_sp_fasta_db $to_keep $input
}

function filter_sp_fasta_db() {
    gene_pattern="$(echo $(cat $1) | tr ' ' '|')"

    $AwkCmd -F "_" -v gene_pattern=$gene_pattern '
        /^>/ && (gene ~ gene_pattern) {
            print entry
        }

        /^>/ {
            entry = $0
            gene = $1
            sub(/^>/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            if (gene ~ gene_pattern)
                print entry
        }
    ' $2

}

function filter_out_sp_fasta_db() {
    gene_pattern="$(echo $(cat $1) | tr ' ' '|')"

    $AwkCmd -v gene_pattern=$gene_pattern '
        /^>/ && (gene !~ gene_pattern) {
            print entry
        }

        /^>/ {
            entry = $0
            gene = $1
            sub(/^>/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            if (gene !~ gene_pattern)
                print entry
        }
    ' $2

}

function split_by_gene() {
    $AwkCmd -F "=" -v SPGENES=$SPGENESDIR '
        function writegene(gene,entry) {
            outputdir = SPGENES "/" gene 
            output = outputdir "/" gene ".fst"
            system("mkdir -p " outputdir)
            print entry >> output
            close(output)
        }

        /^>/ && gene!="" {
            writegene(gene,entry)
        }

        /^>/ {
            entry = $0
            gene = $NF
            sub(/;$/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            writegene(gene,entry)
        }
    ' $1
}

function dereplicate() {
    local CDHIT_ID=0.95
    local CDHIT_DELTA=0.95

 	local gene="${1}"
 	local fastain="${gene}/${gene}.fst"
 	local cdhitout="${gene}/${gene}.cdhit.fst"

     	cd-hit 	-i "${fastain}" \
 	    	-o "${cdhitout}" \
 	    	-c ${CDHIT_DELTA} \
 	    	-G 1 \
			-g 1 \
			-aL 0.95 \
			-s ${CDHIT_ID} \
 	    	-b 350 -p 1 \
 	    	-d 0 -n 3

 	local fasta1="${gene}/${gene}.1l.fst"
}

function dereplicate_genes() {
    pushd $SPGENESDIR
    for g in * ; do 
        dereplicate $g ; 
    done
    popd
}

function buildGeneBlastDB() {
 	local gene="${1}"
 	local fastain="${gene}/${gene}.cdhit.fst"

    loginfo "  formatting Blast $gene DB"
    timeoutcmd 300 makeblastdb -dbtype prot -in ${fastain} >& /dev/null
	loginfo "Done"
}

function buildBlastDBs() {
    pushd $SPGENESDIR
    for g in * ; do 
        buildGeneBlastDB $g ; 
    done
    popd
}

function list_shell_genes() {
    pushd $SPDIR

    ls genes \
    | grep -v '\.' \
    | egrep -iv $(tr " " "|" <<< $CORE_GENES) \
    | grep -iv '^orf' \
    | grep -iv "$RPS12_GENE"

    popd
}

function list_dust_genes() {
    pushd $SPDIR 1>&2

    ls genes \
    | grep -v '\.' \
    | egrep -iv $(tr " " "|" <<< $CORE_GENES) \
    | grep -i '^orf' 

    popd 1>&2
}

function build_core_libraries() {
    pushd $SPDIR 1>&2

    rm -rf core
    mkdir -p core

    for gene in $CORE_GENES ; do 
        cp genes/$gene/$gene.cdhit.fst core/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr core/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin core/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq core/$gene.fst.psq
    done

    popd 1>&2
}

function build_rps12_library() {
    pushd $SPDIR 1>&2

    local gene=$RPS12_GENE

    rm -rf RPS12
    mkdir -p RPS12
    
    cp genes/$gene/$gene.cdhit.fst RPS12/$gene.fst
    cp genes/$gene/$gene.cdhit.fst.phr RPS12/$gene.fst.phr
    cp genes/$gene/$gene.cdhit.fst.pin RPS12/$gene.fst.pin
    cp genes/$gene/$gene.cdhit.fst.psq RPS12/$gene.fst.psq

    popd 1>&2
}

function build_shell_libraries() {
    pushd $SPDIR 1>&2

    rm -rf shell
    mkdir -p shell

    for gene in $(list_shell_genes) ; do 
        cp genes/$gene/$gene.cdhit.fst shell/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr shell/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin shell/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq shell/$gene.fst.psq
    done

    popd 1>&2
}

function build_dust_libraries() {
    pushd $SPDIR 1>&2

    rm -rf dust
    mkdir -p dust

    for gene in $(list_dust_genes) ; do 
        cp genes/$gene/$gene.cdhit.fst dust/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr dust/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin dust/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq dust/$gene.fst.psq
    done

    popd 1>&2
}

function get_product_line() {
    pushd $SPGENESDIR 1>&2

    local gene=$1
    local spac=$(head -1 $gene/$gene.cdhit.fst \
                 | awk '
                     { AC=$2; 
                       sub(/^SP_AC=/,"",AC); 
                       sub(/;$/,"",AC); 
                       print AC}')

    popd 1>&2

    pushd $SPDIR 1>&2

    extract_chloro_gene_ac $spac rawdata/SP_Chloro.dat \
    | grep "^DE   " \
    | $AwkCmd -v gene=$gene '
        function remove_tails(line) {
            sub(/ *(\{[^}]+\});/,"",line)
            st = index(line,"=")
            return substr(line,st+1)
        } 

        /DE +RecName:/ {
            full = remove_tails($0)
        }
        /DE +Short=/ {
            ns = remove_tails($0)
            if (length(ns) > length(short)) {
                short = ns
            }
        }
        /DE +EC=/ {
            ec = remove_tails($0)
        }

        END {
            if (length(short) > 10) {
                product = short
            } else {
                product = full
            }

            if (ec != "") {
                product = product " (EC:" ec ")"
            }

            if (product == "") {
                product = "Hypothetical protein of unknown function"
            } 

            gsub(/ /,"_",product)

            print gene,gene,"--","--","--",product
        }
      '
    popd 1>&2
}

function build_annotate_lst() {
    pushd $SPGENESDIR 1>&2

    for gene in * ; do 
        get_product_line $gene
    done

    popd 1>&2
}

pushd $SPDIR

####
#
# Download and prepare raw library from Swissprot FTP site
#
###

rm -rf rawdata
mkdir -p rawdata

pushd rawdata

download_swissprot | extract_chloro_entries > SP_Chloro.dat

extract_fasta_protein SP_Chloro.dat > SP_Chloro_gene_db.fst

popd

####
#
# Clean swiss-prot fasta file for gene name annotation
#
###

pushd rawdata

rename_genes SP_Chloro_gene_db.fst > SP_Chloro_gene_db.clean_name.fst

clean_strange_gene_name SP_Chloro_gene_db.clean_name.fst \
                      > SP_Chloro_gene_db.good_gene.fst

popd

####
#
# Prepare the database for all genes
#
###

rm -rf genes
split_by_gene rawdata/SP_Chloro_gene_db.good_gene.fst

dereplicate_genes

buildBlastDBs

####
#
# Prepare the differente gene databases for CDS annotation
#
###

build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library

####
#
# Build Annotation file
#
###

build_annotate_lst > Annot.lst

# ls ../../chlorodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst  > join_core.lst

popd
@ -1,106 +0,0 @@
#!/bin/bash
#
#                           BUILD REFERENCE : THE RPS12 LIBRARY
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink

THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/lib/clusterize_prot.sh"

function extract_rps12() {
$AwkCmd '                                          \
      /^LOCUS/    {LOCUS=$2;}                      \
      /^     [^ ]/ { if (CDS) {                    \
                        print LOCUS "/" feature;   \
                        print "#################"  \
                       }                           \
                    CDS=0;                         \
                   }                               \
      /^     CDS / {CDS=1;                         \
                    $1="";                         \
                    feature=""}                    \
      (CDS)        { sub(/^ */,"",$0);             \
                     sub(/ *$/,"",$0);             \
                     feature=feature $0}           \
    ' \
    | egrep -i '"rps12"'  \
    | $AwkCmd -F"/" '                              \
       function printfasta(seq) {                  \
             seqlen=length(seq);                   \
             for (i=1; i <= seqlen; i+=60)         \
                 print substr(seq,i,60);           \
             }                                     \
                                                   \
                ($1 != current) {current=$1;       \
                                  n=1              \
                                 }                 \
                 {$1=$1 "_rps12_" n;               \
                  n++;                             \
                  delete keys;                     \
                  for (i=3; i<=NF; i++) {          \
                      split($i,key,"=");           \
                      keys[key[1]]=key[2]          \
                  }                                \
                  prot = keys["translation"];      \
                  gsub(/"/,"",prot);               \
                  print ">" $1,"location=" $2 ";"; \
                  printfasta(prot)                 \
                 }                                 \
    ' 
}


pushTmpDir ORG.buildRPS12DB

	RPS12FILE=RPS12_prot.fst	

	openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"

	loginfo "Selecting Viridiplantae genbank entries..."
		VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*) 
		loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
	loginfo "Done"

    loginfo "Extracting the RPS12 protein sequences from the plants entries..."
        ( for gbk in ${VIRIDIPLANTAE} ; do
            gzcat $gbk | \
                extract_rps12 
          done ) > ${RPS12FILE}
	loginfo "Done"

	loginfo "Installing the RPS12 protein sequence database..."
		
		cp ${RPS12FILE} "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"

	loginfo "Done"

popTmpDir

pushd "${CDS_DATA_DIR}/chlorodb"

	loginfo "Clusterizing the RPS12 protein sequence database..."
    rm -rf RPS12_DB.clean.fst
    clusterize RPS12_DB
	loginfo "Done"

    loginfo "  formatting Blast RPS12 DB"
    timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null
	loginfo "Done"


popd

#
# format blast protein database
#


loginfo "Done"

@ -1,8 +1,13 @@
#!/bin/bash
#

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  gawk 'function printfasta(seq) {                                            \

  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                     \

@ -1,8 +1,13 @@
#!/bin/bash
#

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

  gawk 'function printfasta(seq) {                                            \

  $AwkCmd 'function printfasta(seq) {                                            \
             seqlen=length(seq);                                             \
             for (i=1; i <= seqlen; i+=60)                                   \
                 print substr(seq,i,60);                                     \

@ -1,5 +1,10 @@
#!/bin/bash

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"


(                                          \
	for f in $* ; do                       \
@ -12,7 +17,7 @@
    done                                   \
) | \
  grep -B 1 Viridiplantae | \
  gawk '{print $1}' | \
  $AwkCmd '{print $1}' | \
  grep '\.gbk' | \
  sed -E 's/(^.*\.gbk(.gz)?).$/\1/' | \
  uniq