annotate/detectors/cds/tools/build_swissprot_mito_db.sh

#!/bin/bash
#
#                           BUILD REFERENCE : From SwissProt
#
#========================================================================================

# -- CAUTION -- Works as long than the script 
#               is not called through a symlink

THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"

SPDIR="$CDS_DATA_DIR/sp_mitodb"
SPGENESDIR="$SPDIR/genes"

CORE_GENES="atp6 atp8"
CORE_GENES="$CORE_GENES cox1 cox2 cox3"
CORE_GENES="$CORE_GENES cytb"
CORE_GENES="$CORE_GENES nd1 nd2 nd3 nd4 nd4L nd5 nd6"

function download_swissprot() {
    URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"

    curl $URL
}

function extract_mito_entries() {
    awk -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion\./ && !/DE   Flags: Fragment;/ {print $0}
    ' $1
}

function extract_mito_gene_entries() {

    awk -v gene=$1 -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        ($0 ~ gene"_") && /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion./ {print $0}
    ' $2
}

function extract_mito_gene_frg() {

    awk -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        /DE   Flags: Fragment;/ {print $0}
    ' $1
}

function extract_mito_gene_ac() {

    awk -v ac=$1 -F'\n' '
        BEGIN {RS="//\n"; ORS=RS; OFS="\n"} 
        ($0 ~ "AC   " ac ";") && /OC   Eukaryota; Metazoa;/ && /OG   Mitochondrion./ {print $0}
    ' $2
}

function extract_fasta_protein() {
    $AwkCmd '
        /^ID/ {ID=$2} 
        /^AC/ {AC=$2; sub(";","",AC)} 
        /^DR   EMBL;/ {
            EMBL=$4; 
            sub(";","",EMBL); 
            sub("-","xxx",EMBL);
            if ( EMBL != "xxx" ) {
                EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
                EBI | getline GENE
                close(EBI)
            } else {
                GENE = "xxx"
            }
            } 
        /^   / {gsub(/ /,"",$0); SEQ=SEQ $0}
        /^\/\// {
            if (GENE != "xxx") {
                print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
                print SEQ 
            }
            ID="xxx";
            AC="xxx"; 
            EMBL="xxx"; 
            GENE="xxx"
            SEQ=""
        }
        ' $1 \
    | formatfasta
}

function rename_rules() {
    egrep "^>" $1 \
    | sed 's/^>//' \
    | sed 's/_/=/' \
    | tr -d ";" \
    | awk -F"=" '{print $1,$NF}' \
    | sort \
    | uniq -c \
    | awk '{
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -rn \
    | sed 's/ /=/' \
    | sed 's/ /=/' \
    | awk -F "=" '
        ($1 == current) {
            sub(/^0+/,"",occurrence)
            print "# based on",occurrence,"observations renames",$NF,"in",gene
            print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
        }
        ($1 != current) {
            current= $1
            gene=$NF 
            occurrence=$2
        }
        '
}

function rename_genes() {
    local n=1
    local input=$1
    cat $input > __tmp__$$__fasta
    rules=${input/fst/rules}
    #echo $rule 1>&2

    rename_rules __tmp__$$__fasta > $rules.$n
    while [[ -s $rules.$n ]] ; do
        sed -f $rules.$n __tmp__$$__fasta > __tmp2__$$__fasta
        mv __tmp2__$$__fasta __tmp__$$__fasta
        ((n++))
        rename_rules __tmp__$$__fasta > $rules.$n
    done

    cat __tmp__$$__fasta
    rm __tmp__$$__fasta
}


function clean_strange_gene_name() {
    local input=$1
    local output=${input/.fst/_sp_ebi.genes}
    local to_keep=${input/.fst/_to_keep.lst}
    local to_be_removed=${input/.fst/_to_be_removed.lst}

    grep '^>' $input \
    | sed 's/^>//' \
    | sed 's/_/=/' \
    | sed 's/;$//' \
    | awk -F'=' '{print $NF,$1}' \
    | sort \
    | uniq -c \
    | $AwkCmd '{
        x=$1
        $1=$2
        $2=sprintf("%06d",x)
        print $0}' \
    | sort -r > $output

    $AwkCmd '
        ($1!=current) {current=$1; print $3}
        ' $output > $to_keep

    $AwkCmd '
        ($1==current) {print $3} 
        ($1!=current) {current=$1}
        ' $output > $to_be_removed

    filter_sp_fasta_db $to_be_removed $input > ${input/.fst/_strange_genes.fst}
    filter_sp_fasta_db $to_keep $input
}

function filter_sp_fasta_db() {
    gene_pattern="$(echo $(cat $1) | tr ' ' '|')"

    $AwkCmd -F "_" -v gene_pattern=$gene_pattern '
        /^>/ && (gene ~ gene_pattern) {
            print entry
        }

        /^>/ {
            entry = $0
            gene = $1
            sub(/^>/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            if (gene ~ gene_pattern)
                print entry
        }
    ' $2

}

function filter_out_sp_fasta_db() {
    gene_pattern="$(echo $(cat $1) | tr ' ' '|')"

    $AwkCmd -v gene_pattern=$gene_pattern '
        /^>/ && (gene !~ gene_pattern) {
            print entry
        }

        /^>/ {
            entry = $0
            gene = $1
            sub(/^>/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            if (gene !~ gene_pattern)
                print entry
        }
    ' $2

}

function split_by_gene() {
    $AwkCmd -F "=" -v SPGENES=$SPGENESDIR '
        function writegene(gene,entry) {
            outputdir = SPGENES "/" gene 
            output = outputdir "/" gene ".fst"
            system("mkdir -p " outputdir)
            print entry >> output
            close(output)
        }

        /^>/ && gene!="" {
            writegene(gene,entry)
        }

        /^>/ {
            entry = $0
            gene = $NF
            sub(/;$/, "", gene)
        }

        !/^>/ {
            entry = entry "\n" $0
        }

        END {
            writegene(gene,entry)
        }
    ' $1
}

function dereplicate() {
    local CDHIT_ID=0.95
    local CDHIT_DELTA=0.95

 	local gene="${1}"
 	local fastain="${gene}/${gene}.fst"
 	local cdhitout="${gene}/${gene}.cdhit.fst"

     	cd-hit 	-i "${fastain}" \
 	    	-o "${cdhitout}" \
 	    	-c ${CDHIT_DELTA} \
 	    	-G 1 \
			-g 1 \
			-aL 0.95 \
			-s ${CDHIT_ID} \
 	    	-b 350 -p 1 \
 	    	-d 0 -n 3

 	local fasta1="${gene}/${gene}.1l.fst"
}

function dereplicate_genes() {
    pushd $SPGENESDIR
    for g in * ; do 
        dereplicate $g ; 
    done
    popd
}

function buildGeneBlastDB() {
 	local gene="${1}"
 	local fastain="${gene}/${gene}.cdhit.fst"

    loginfo "  formatting Blast $gene DB"
    timeoutcmd 300 makeblastdb -dbtype prot -in ${fastain} >& /dev/null
	loginfo "Done"
}

function buildBlastDBs() {
    pushd $SPGENESDIR
    for g in * ; do 
        buildGeneBlastDB $g ; 
    done
    popd
}

function list_shell_genes() {
    pushd $SPDIR

    ls genes \
    | grep -v '\.' \
    | egrep -iv $(tr " " "|" <<< $CORE_GENES) \
    | grep -iv '^orf' \
    | grep -iv "$RPS12_GENE"

    popd
}

function list_dust_genes() {
    pushd $SPDIR 1>&2

    ls genes \
    | grep -v '\.' \
    | egrep -iv $(tr " " "|" <<< $CORE_GENES) \
    | grep -i '^orf' 

    popd 1>&2
}

function build_core_libraries() {
    pushd $SPDIR 1>&2

    rm -rf core
    mkdir -p core

    for gene in $CORE_GENES ; do 
        cp genes/$gene/$gene.cdhit.fst core/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr core/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin core/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq core/$gene.fst.psq
    done

    popd 1>&2
}

function build_rps12_library() {
    pushd $SPDIR 1>&2

    local gene=$RPS12_GENE

    rm -rf RPS12
    mkdir -p RPS12
    
    cp genes/$gene/$gene.cdhit.fst RPS12/$gene.fst
    cp genes/$gene/$gene.cdhit.fst.phr RPS12/$gene.fst.phr
    cp genes/$gene/$gene.cdhit.fst.pin RPS12/$gene.fst.pin
    cp genes/$gene/$gene.cdhit.fst.psq RPS12/$gene.fst.psq

    popd 1>&2
}

function build_shell_libraries() {
    pushd $SPDIR 1>&2

    rm -rf shell
    mkdir -p shell

    for gene in $(list_shell_genes) ; do 
        cp genes/$gene/$gene.cdhit.fst shell/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr shell/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin shell/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq shell/$gene.fst.psq
    done

    popd 1>&2
}

function build_dust_libraries() {
    pushd $SPDIR 1>&2

    rm -rf dust
    mkdir -p dust

    for gene in $(list_dust_genes) ; do 
        cp genes/$gene/$gene.cdhit.fst dust/$gene.fst
        cp genes/$gene/$gene.cdhit.fst.phr dust/$gene.fst.phr
        cp genes/$gene/$gene.cdhit.fst.pin dust/$gene.fst.pin
        cp genes/$gene/$gene.cdhit.fst.psq dust/$gene.fst.psq
    done

    popd 1>&2
}

function get_product_line() {
    pushd $SPGENESDIR 1>&2

    local gene=$1
    local spac=$(head -1 $gene/$gene.cdhit.fst \
                 | awk '
                     { AC=$2; 
                       sub(/^SP_AC=/,"",AC); 
                       sub(/;$/,"",AC); 
                       print AC}')

    popd 1>&2

    pushd $SPDIR 1>&2

    extract_mito_gene_ac $spac rawdata/SP_Mito.dat \
    | grep "^DE   " \
    | $AwkCmd -v gene=$gene '
        function remove_tails(line) {
            sub(/ *(\{[^}]+\});/,"",line)
            st = index(line,"=")
            return substr(line,st+1)
        } 

        /DE +RecName:/ {
            full = remove_tails($0)
        }
        /DE +Short=/ {
            ns = remove_tails($0)
            if (length(ns) > length(short)) {
                short = ns
            }
        }
        /DE +EC=/ {
            ec = remove_tails($0)
        }

        END {
            if (length(short) > 10) {
                product = short
            } else {
                product = full
            }

            if (ec != "") {
                product = product " (EC:" ec ")"
            }

            if (product == "") {
                product = "Hypothetical protein of unknown function"
            } 

            gsub(/ /,"_",product)

            print gene,gene,"--","--","--",product
        }
      '
    popd 1>&2
}

function build_annotate_lst() {
    pushd $SPGENESDIR 1>&2

    for gene in * ; do 
        get_product_line $gene
    done

    popd 1>&2
}

pushd $SPDIR

####
#
# Download and prepare raw library from Swissprot FTP site
#
###

rm -rf rawdata
mkdir -p rawdata

pushd rawdata

download_swissprot | extract_mito_entries > SP_Mito.dat

extract_fasta_protein SP_Mito.dat > SP_Mito_gene_db.fst

popd

####
#
# Clean swiss-prot fasta file for gene name annotation
#
###

pushd rawdata

rename_genes SP_Mito_gene_db.fst > SP_Mito_gene_db.clean_name.fst

clean_strange_gene_name SP_Mito_gene_db.clean_name.fst \
                      > SP_Mito_gene_db.good_gene.fst

popd

####
#
# Prepare the database for all genes
#
###

rm -rf genes
split_by_gene rawdata/SP_Mito_gene_db.good_gene.fst

dereplicate_genes

buildBlastDBs

####
#
# Prepare the differente gene databases for CDS annotation
#
###

build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library

####
#
# Build Annotation file
#
###

build_annotate_lst > Annot.lst

# ls ../../mitodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst  > join_core.lst

popd