Files
annotate/detectors/cds/tools/build_swissprot_mito_db.sh
Eric Coissac 2c012eec8e first batch
Former-commit-id: 1eecb206a17c4aff21d1170b48db134ce3c4f14e
2025-03-01 16:15:28 +01:00

528 lines
11 KiB
Bash

#!/bin/bash
#
# BUILD REFERENCE : From SwissProt
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
SPDIR="$CDS_DATA_DIR/sp_mitodb"
SPGENESDIR="$SPDIR/genes"
CORE_GENES="atp6 atp8"
CORE_GENES="$CORE_GENES cox1 cox2 cox3"
CORE_GENES="$CORE_GENES cytb"
CORE_GENES="$CORE_GENES nd1 nd2 nd3 nd4 nd4L nd5 nd6"
function download_swissprot() {
URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"
curl $URL
}
function extract_mito_entries() {
awk -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
/OC Eukaryota; Metazoa;/ && /OG Mitochondrion\./ && !/DE Flags: Fragment;/ {print $0}
' $1
}
function extract_mito_gene_entries() {
awk -v gene=$1 -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
($0 ~ gene"_") && /OC Eukaryota; Metazoa;/ && /OG Mitochondrion./ {print $0}
' $2
}
function extract_mito_gene_frg() {
awk -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
/DE Flags: Fragment;/ {print $0}
' $1
}
function extract_mito_gene_ac() {
awk -v ac=$1 -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
($0 ~ "AC " ac ";") && /OC Eukaryota; Metazoa;/ && /OG Mitochondrion./ {print $0}
' $2
}
function extract_fasta_protein() {
$AwkCmd '
/^ID/ {ID=$2}
/^AC/ {AC=$2; sub(";","",AC)}
/^DR EMBL;/ {
EMBL=$4;
sub(";","",EMBL);
sub("-","xxx",EMBL);
if ( EMBL != "xxx" ) {
EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
EBI | getline GENE
close(EBI)
} else {
GENE = "xxx"
}
}
/^ / {gsub(/ /,"",$0); SEQ=SEQ $0}
/^\/\// {
if (GENE != "xxx") {
print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
print SEQ
}
ID="xxx";
AC="xxx";
EMBL="xxx";
GENE="xxx"
SEQ=""
}
' $1 \
| formatfasta
}
function rename_rules() {
egrep "^>" $1 \
| sed 's/^>//' \
| sed 's/_/=/' \
| tr -d ";" \
| awk -F"=" '{print $1,$NF}' \
| sort \
| uniq -c \
| awk '{
x=$1
$1=$2
$2=sprintf("%06d",x)
print $0}' \
| sort -rn \
| sed 's/ /=/' \
| sed 's/ /=/' \
| awk -F "=" '
($1 == current) {
sub(/^0+/,"",occurrence)
print "# based on",occurrence,"observations renames",$NF,"in",gene
print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
}
($1 != current) {
current= $1
gene=$NF
occurrence=$2
}
'
}
function rename_genes() {
local n=1
local input=$1
cat $input > __tmp__$$__fasta
rules=${input/fst/rules}
#echo $rule 1>&2
rename_rules __tmp__$$__fasta > $rules.$n
while [[ -s $rules.$n ]] ; do
sed -f $rules.$n __tmp__$$__fasta > __tmp2__$$__fasta
mv __tmp2__$$__fasta __tmp__$$__fasta
((n++))
rename_rules __tmp__$$__fasta > $rules.$n
done
cat __tmp__$$__fasta
rm __tmp__$$__fasta
}
function clean_strange_gene_name() {
local input=$1
local output=${input/.fst/_sp_ebi.genes}
local to_keep=${input/.fst/_to_keep.lst}
local to_be_removed=${input/.fst/_to_be_removed.lst}
grep '^>' $input \
| sed 's/^>//' \
| sed 's/_/=/' \
| sed 's/;$//' \
| awk -F'=' '{print $NF,$1}' \
| sort \
| uniq -c \
| $AwkCmd '{
x=$1
$1=$2
$2=sprintf("%06d",x)
print $0}' \
| sort -r > $output
$AwkCmd '
($1!=current) {current=$1; print $3}
' $output > $to_keep
$AwkCmd '
($1==current) {print $3}
($1!=current) {current=$1}
' $output > $to_be_removed
filter_sp_fasta_db $to_be_removed $input > ${input/.fst/_strange_genes.fst}
filter_sp_fasta_db $to_keep $input
}
function filter_sp_fasta_db() {
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
$AwkCmd -F "_" -v gene_pattern=$gene_pattern '
/^>/ && (gene ~ gene_pattern) {
print entry
}
/^>/ {
entry = $0
gene = $1
sub(/^>/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
if (gene ~ gene_pattern)
print entry
}
' $2
}
function filter_out_sp_fasta_db() {
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
$AwkCmd -v gene_pattern=$gene_pattern '
/^>/ && (gene !~ gene_pattern) {
print entry
}
/^>/ {
entry = $0
gene = $1
sub(/^>/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
if (gene !~ gene_pattern)
print entry
}
' $2
}
function split_by_gene() {
$AwkCmd -F "=" -v SPGENES=$SPGENESDIR '
function writegene(gene,entry) {
outputdir = SPGENES "/" gene
output = outputdir "/" gene ".fst"
system("mkdir -p " outputdir)
print entry >> output
close(output)
}
/^>/ && gene!="" {
writegene(gene,entry)
}
/^>/ {
entry = $0
gene = $NF
sub(/;$/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
writegene(gene,entry)
}
' $1
}
function dereplicate() {
local CDHIT_ID=0.95
local CDHIT_DELTA=0.95
local gene="${1}"
local fastain="${gene}/${gene}.fst"
local cdhitout="${gene}/${gene}.cdhit.fst"
cd-hit -i "${fastain}" \
-o "${cdhitout}" \
-c ${CDHIT_DELTA} \
-G 1 \
-g 1 \
-aL 0.95 \
-s ${CDHIT_ID} \
-b 350 -p 1 \
-d 0 -n 3
local fasta1="${gene}/${gene}.1l.fst"
}
function dereplicate_genes() {
pushd $SPGENESDIR
for g in * ; do
dereplicate $g ;
done
popd
}
function buildGeneBlastDB() {
local gene="${1}"
local fastain="${gene}/${gene}.cdhit.fst"
loginfo " formatting Blast $gene DB"
timeoutcmd 300 makeblastdb -dbtype prot -in ${fastain} >& /dev/null
loginfo "Done"
}
function buildBlastDBs() {
pushd $SPGENESDIR
for g in * ; do
buildGeneBlastDB $g ;
done
popd
}
function list_shell_genes() {
pushd $SPDIR
ls genes \
| grep -v '\.' \
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
| grep -iv '^orf' \
| grep -iv "$RPS12_GENE"
popd
}
function list_dust_genes() {
pushd $SPDIR 1>&2
ls genes \
| grep -v '\.' \
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
| grep -i '^orf'
popd 1>&2
}
function build_core_libraries() {
pushd $SPDIR 1>&2
rm -rf core
mkdir -p core
for gene in $CORE_GENES ; do
cp genes/$gene/$gene.cdhit.fst core/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr core/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin core/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq core/$gene.fst.psq
done
popd 1>&2
}
function build_rps12_library() {
pushd $SPDIR 1>&2
local gene=$RPS12_GENE
rm -rf RPS12
mkdir -p RPS12
cp genes/$gene/$gene.cdhit.fst RPS12/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr RPS12/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin RPS12/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq RPS12/$gene.fst.psq
popd 1>&2
}
function build_shell_libraries() {
pushd $SPDIR 1>&2
rm -rf shell
mkdir -p shell
for gene in $(list_shell_genes) ; do
cp genes/$gene/$gene.cdhit.fst shell/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr shell/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin shell/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq shell/$gene.fst.psq
done
popd 1>&2
}
function build_dust_libraries() {
pushd $SPDIR 1>&2
rm -rf dust
mkdir -p dust
for gene in $(list_dust_genes) ; do
cp genes/$gene/$gene.cdhit.fst dust/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr dust/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin dust/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq dust/$gene.fst.psq
done
popd 1>&2
}
function get_product_line() {
pushd $SPGENESDIR 1>&2
local gene=$1
local spac=$(head -1 $gene/$gene.cdhit.fst \
| awk '
{ AC=$2;
sub(/^SP_AC=/,"",AC);
sub(/;$/,"",AC);
print AC}')
popd 1>&2
pushd $SPDIR 1>&2
extract_mito_gene_ac $spac rawdata/SP_Mito.dat \
| grep "^DE " \
| $AwkCmd -v gene=$gene '
function remove_tails(line) {
sub(/ *(\{[^}]+\});/,"",line)
st = index(line,"=")
return substr(line,st+1)
}
/DE +RecName:/ {
full = remove_tails($0)
}
/DE +Short=/ {
ns = remove_tails($0)
if (length(ns) > length(short)) {
short = ns
}
}
/DE +EC=/ {
ec = remove_tails($0)
}
END {
if (length(short) > 10) {
product = short
} else {
product = full
}
if (ec != "") {
product = product " (EC:" ec ")"
}
if (product == "") {
product = "Hypothetical protein of unknown function"
}
gsub(/ /,"_",product)
print gene,gene,"--","--","--",product
}
'
popd 1>&2
}
function build_annotate_lst() {
pushd $SPGENESDIR 1>&2
for gene in * ; do
get_product_line $gene
done
popd 1>&2
}
pushd $SPDIR
####
#
# Download and prepare raw library from Swissprot FTP site
#
###
rm -rf rawdata
mkdir -p rawdata
pushd rawdata
download_swissprot | extract_mito_entries > SP_Mito.dat
extract_fasta_protein SP_Mito.dat > SP_Mito_gene_db.fst
popd
####
#
# Clean swiss-prot fasta file for gene name annotation
#
###
pushd rawdata
rename_genes SP_Mito_gene_db.fst > SP_Mito_gene_db.clean_name.fst
clean_strange_gene_name SP_Mito_gene_db.clean_name.fst \
> SP_Mito_gene_db.good_gene.fst
popd
####
#
# Prepare the database for all genes
#
###
rm -rf genes
split_by_gene rawdata/SP_Mito_gene_db.good_gene.fst
dereplicate_genes
buildBlastDBs
####
#
# Prepare the differente gene databases for CDS annotation
#
###
build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library
####
#
# Build Annotation file
#
###
build_annotate_lst > Annot.lst
# ls ../../mitodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst > join_core.lst
popd