Switch to a swissprot based reference database for CDS annotation

Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f
Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
2022-02-16 22:50:17 +01:00
parent 90b3ee9b04
commit 831669433e
644 changed files with 25433 additions and 485597 deletions

View File

@ -6,10 +6,13 @@
#
# Annotate CDS using exonerate
#
# do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
# do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
#
# - <PASS> : the pass running exonarate
# - <FASTAGENOM> : The fasta file containing the genome to annotate
# - <FASTAPROT> : The fasta file containing the protein family
# - <ANNOTFILE> : The annotation file used to add product info
# - <MODELDIR> : Directory containing model parameters for exonerate
#
# Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res
#
@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'
NeedArg 2
set Pass = $Argv[1]; Shift
set GenoFile = $Argv[1]; Shift
set GenoName = `basename $GenoFile:r`
set ProtFile = $Argv[1]; Shift
set ProtDir = `dirname $ProtFile`
set DBDir = `dirname $ProtDir`
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
set ProtType = `basename $ProtDir`
set AnnotFile = $Argv[1]; Shift
NeedFile $GenoFile
NeedFile $ProtFile
NeedFile $ProtDir/Annot.lst
NeedFile $AnnotFile
set ModelsDir = $PROG_DIR/../models
if ($#Argv > 0) then
@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
-v ALLOW_STOP=1 \
-v EXCLUDE=$GenoName \
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
endif
endif
@ -196,7 +203,16 @@ endif
# get annotations
#
egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
if ( ${%sp_match} > 0 ) then
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
mv T_$$ $base.exo.best
endif
egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$
#
# extend start/stop
@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
# translate
#
echo "c pass pass1 $ProtType" > $base.iff
echo "c pass $Pass $ProtType" > $base.iff
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
-f $LIB_DIR/translate.awk T_$$ >> $base.iff

View File

@ -37,10 +37,13 @@ else
TEMP=""
fi
DBROOT="$CDS_DATA_DIR/chlorodb/RPS12"
RPS12DB="${DBROOT}/RPS12_DB.clean.fst"
DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12"
RPS12DB="${DBROOT}/rps12.fst"
DELTA=50
AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst"
ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models"
SEQLEN=$(seqlength "${QUERY}")
SEQUENCE=$(readfirstfastaseq "${QUERY}")
@ -61,7 +64,9 @@ blastx \
BEGIN {BEST_EVAL = 1e-40;
OUT = 0}
/^#/ {next}
($2 == PREV_CDS) { HSPs = HSPs "\n" $0;}
($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) {
HSPs = HSPs "\n" $0;
}
(OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) {
if (PREV_CDS) print HSPs;
@ -75,6 +80,7 @@ blastx \
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
' > "rps12_locate.hsps"
#
# Extracting protein ids from selected blast HSPs
#
@ -83,7 +89,6 @@ blastx \
| sort \
| uniq > "dbsel.txt"
#
# Extract corresponding protein sequences
# from the RPS12 database.
@ -134,7 +139,7 @@ blastx \
}
}
' \
| sort -nk 3 \
| sort -nk 3 \
| $AwkCmd '($3 != old3 || $4 != old4) {
i++
old3=$3
@ -262,13 +267,14 @@ blastx \
# It should be one or two fragments
#
export PASS1_SPEEDUP=0
cp $DBROOT/Annot.lst RPS12
nbseq = 0
nbseq=0
for fasta in rps12_fragments_*.fasta ; do
tcsh -f ${PROG_DIR}/do_exonerate.csh \
Pass2 \
$fasta \
"RPS12/rps12.fasta" \
$DBROOT/../models $(pwd)
$AnnotFile \
$ModelsDir $(pwd)
((nbseq=nbseq+1))
done

View File

@ -35,19 +35,22 @@ Genome=$(basename ${Fasta%.*})
# DbRoot is set to its default values except
# if the second argument precise another DbRoot
DbRoot="$CDS_DATA_DIR/chlorodb"
DbRoot="$CDS_DATA_DIR/sp_chlorodb"
if (( $# > 0)) ; then
DbRoot="$1"; Shift
fi
AnnotFile="$DbRoot/Annot.lst"
needdir $DbRoot
needdir $DbRoot/core
needfile $DbRoot/core/Annot.lst
needfile $AnnotFile
needdir $DbRoot/models
assignundef cdsdetection_pass1 yes
assignundef cdsdetection_pass2 yes
assignundef cdsdetection_pass3 yes
temp=$(mktempdir $(hostname))
@ -63,33 +66,47 @@ Fasta="$temp/genome.fasta"
#
if [[ "$cdsdetection_pass1" == "yes" ]] ; then
for dir in "core" "shell" "dust" ; do
for dir in "core" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.clean.fst)
fams=$(ls $DbRoot/$dir/*.fst)
loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
cp $temp/genome.cds.fasta $Genome.cds.fasta
mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta
fi
#
# pass2: transsplicing
# pass2: RPS12 gene with transsplicing
#
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
$PROG_DIR/do_rps12.sh $Fasta $temp
fi
#
# pass3: prokov
# pass3: run exonerate on shell and dust
#
if [[ "$cdsdetection_pass3" == "yes" ]] ; then
for dir in "shell" ; do
if [[ -d $DbRoot/$dir ]] ; then
fams=$(ls $DbRoot/$dir/*.fst)
loginfo $fams
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
for f in $fams ; do
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
done
fi
done
mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta
fi
# $PROG_DIR/do_prokov.sh $Fasta $Genome.cds.fasta $temp
#

View File

@ -142,8 +142,7 @@ function Unk(s) {
}
/^e similarity/ {
split($12, a, "@")
Simil = a[1] ":" a[2]
Simil = a[1] "UniProtKB/Swiss-Prot:" $12
next
}
@ -179,7 +178,7 @@ function Unk(s) {
QQualifier("note","nonfunctional due to a frameshift")
}
QQualifier("product", Product)
QQualifier("inference", "similar to DNA sequence:" Simil)
QQualifier("inference", "similar to " Simil)
# QQualifier("inference", "org.annot -- detect pass:" PassType ":" PassInfo)
if (FrameShift==0) {
if (match(Translat,/\*/)>0) {

View File

@ -0,0 +1,542 @@
#!/bin/bash
#
# BUILD REFERENCE : From SwissProt
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
SPDIR="$CDS_DATA_DIR/sp_chlorodb"
SPGENESDIR="$SPDIR/genes"
CORE_GENES="accD"
CORE_GENES="$CORE_GENES atpA atpB atpE atpF atpH atpI"
CORE_GENES="$CORE_GENES ccsA"
CORE_GENES="$CORE_GENES cemA"
CORE_GENES="$CORE_GENES clpP"
CORE_GENES="$CORE_GENES infA"
CORE_GENES="$CORE_GENES matK"
CORE_GENES="$CORE_GENES ndhA ndhB ndhC ndhD ndhE ndhF ndhG ndhH ndhI ndhJ ndhK"
CORE_GENES="$CORE_GENES petA petB petD petG petL petN"
CORE_GENES="$CORE_GENES psaA psaB psaC psaI psaJ psbA"
CORE_GENES="$CORE_GENES psbB psbC psbD psbE psbF psbH psbI psbJ psbK psbL psbM psbN psbT psbZ"
CORE_GENES="$CORE_GENES rbcL"
CORE_GENES="$CORE_GENES rpl14 rpl16 rpl2 rpl20 rpl22 rpl23 rpl32 rpl33 rpl36"
CORE_GENES="$CORE_GENES rpoA rpoB rpoC1 rpoC2"
CORE_GENES="$CORE_GENES rps2 rps3 rps4 rps7 rps8 rps11 rps14 rps15 rps16 rps18 rps19"
CORE_GENES="$CORE_GENES ycf1 ycf2 ycf3 ycf4"
RPS12_GENE="rps12"
function download_swissprot() {
URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"
curl $URL
}
function extract_chloro_entries() {
awk -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
/OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ && !/DE Flags: Fragment;/ {print $0}
' $1
}
function extract_chloro_gene_entries() {
awk -v gene=$1 -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
($0 ~ gene"_") && /OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ {print $0}
' $2
}
function extract_chloro_gene_frg() {
awk -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
/DE Flags: Fragment;/ {print $0}
' $1
}
function extract_chloro_gene_ac() {
awk -v ac=$1 -F'\n' '
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
($0 ~ "AC " ac ";") && /OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ {print $0}
' $2
}
function extract_fasta_protein() {
$AwkCmd '
/^ID/ {ID=$2}
/^AC/ {AC=$2; sub(";","",AC)}
/^DR EMBL;/ {
EMBL=$4;
sub(";","",EMBL);
sub("-","xxx",EMBL);
if ( EMBL != "xxx" ) {
EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
EBI | getline GENE
close(EBI)
} else {
GENE = "xxx"
}
}
/^ / {gsub(/ /,"",$0); SEQ=SEQ $0}
/^\/\// {
if (GENE != "xxx") {
print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
print SEQ
}
ID="xxx";
AC="xxx";
EMBL="xxx";
GENE="xxx"
SEQ=""
}
' $1 \
| formatfasta
}
function rename_rules() {
egrep "^>" $1 \
| sed 's/^>//' \
| sed 's/_/=/' \
| tr -d ";" \
| awk -F"=" '{print $1,$NF}' \
| sort \
| uniq -c \
| awk '{
x=$1
$1=$2
$2=sprintf("%06d",x)
print $0}' \
| sort -rn \
| sed 's/ /=/' \
| sed 's/ /=/' \
| awk -F "=" '
($1 == current) {
sub(/^0+/,"",occurrence)
print "# based on",occurrence,"observations renames",$NF,"in",gene
print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
}
($1 != current) {
current= $1
gene=$NF
occurrence=$2
}
'
}
function rename_genes() {
local n=1
local input=$1
cat $input > __tmp__$$__fasta
rules=${input/fst/rules}
#echo $rule 1>&2
rename_rules __tmp__$$__fasta > $rules.$n
while [[ -s $rules.$n ]] ; do
sed -f $rules.$n __tmp__$$__fasta > __tmp2__$$__fasta
mv __tmp2__$$__fasta __tmp__$$__fasta
((n++))
rename_rules __tmp__$$__fasta > $rules.$n
done
cat __tmp__$$__fasta
rm __tmp__$$__fasta
}
function clean_strange_gene_name() {
local input=$1
local output=${input/.fst/_sp_ebi.genes}
local to_keep=${input/.fst/_to_keep.lst}
local to_be_removed=${input/.fst/_to_be_removed.lst}
grep '^>' $input \
| sed 's/^>//' \
| sed 's/_/=/' \
| sed 's/;$//' \
| awk -F'=' '{print $NF,$1}' \
| sort \
| uniq -c \
| $AwkCmd '{
x=$1
$1=$2
$2=sprintf("%06d",x)
print $0}' \
| sort -r > $output
$AwkCmd '
($1!=current) {current=$1; print $3}
' $output > $to_keep
$AwkCmd '
($1==current) {print $3}
($1!=current) {current=$1}
' $output > $to_be_removed
filter_sp_fasta_db $to_be_removed $input > ${input/.fst/_strange_genes.fst}
filter_sp_fasta_db $to_keep $input
}
function filter_sp_fasta_db() {
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
$AwkCmd -F "_" -v gene_pattern=$gene_pattern '
/^>/ && (gene ~ gene_pattern) {
print entry
}
/^>/ {
entry = $0
gene = $1
sub(/^>/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
if (gene ~ gene_pattern)
print entry
}
' $2
}
function filter_out_sp_fasta_db() {
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
$AwkCmd -v gene_pattern=$gene_pattern '
/^>/ && (gene !~ gene_pattern) {
print entry
}
/^>/ {
entry = $0
gene = $1
sub(/^>/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
if (gene !~ gene_pattern)
print entry
}
' $2
}
function split_by_gene() {
$AwkCmd -F "=" -v SPGENES=$SPGENESDIR '
function writegene(gene,entry) {
outputdir = SPGENES "/" gene
output = outputdir "/" gene ".fst"
system("mkdir -p " outputdir)
print entry >> output
close(output)
}
/^>/ && gene!="" {
writegene(gene,entry)
}
/^>/ {
entry = $0
gene = $NF
sub(/;$/, "", gene)
}
!/^>/ {
entry = entry "\n" $0
}
END {
writegene(gene,entry)
}
' $1
}
function dereplicate() {
local CDHIT_ID=0.95
local CDHIT_DELTA=0.95
local gene="${1}"
local fastain="${gene}/${gene}.fst"
local cdhitout="${gene}/${gene}.cdhit.fst"
cd-hit -i "${fastain}" \
-o "${cdhitout}" \
-c ${CDHIT_DELTA} \
-G 1 \
-g 1 \
-aL 0.95 \
-s ${CDHIT_ID} \
-b 350 -p 1 \
-d 0 -n 3
local fasta1="${gene}/${gene}.1l.fst"
}
function dereplicate_genes() {
pushd $SPGENESDIR
for g in * ; do
dereplicate $g ;
done
popd
}
function buildGeneBlastDB() {
local gene="${1}"
local fastain="${gene}/${gene}.cdhit.fst"
loginfo " formatting Blast $gene DB"
timeoutcmd 300 makeblastdb -dbtype prot -in ${fastain} >& /dev/null
loginfo "Done"
}
function buildBlastDBs() {
pushd $SPGENESDIR
for g in * ; do
buildGeneBlastDB $g ;
done
popd
}
function list_shell_genes() {
pushd $SPDIR
ls genes \
| grep -v '\.' \
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
| grep -iv '^orf' \
| grep -iv "$RPS12_GENE"
popd
}
function list_dust_genes() {
pushd $SPDIR 1>&2
ls genes \
| grep -v '\.' \
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
| grep -i '^orf'
popd 1>&2
}
function build_core_libraries() {
pushd $SPDIR 1>&2
rm -rf core
mkdir -p core
for gene in $CORE_GENES ; do
cp genes/$gene/$gene.cdhit.fst core/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr core/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin core/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq core/$gene.fst.psq
done
popd 1>&2
}
function build_rps12_library() {
pushd $SPDIR 1>&2
local gene=$RPS12_GENE
rm -rf RPS12
mkdir -p RPS12
cp genes/$gene/$gene.cdhit.fst RPS12/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr RPS12/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin RPS12/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq RPS12/$gene.fst.psq
popd 1>&2
}
function build_shell_libraries() {
pushd $SPDIR 1>&2
rm -rf shell
mkdir -p shell
for gene in $(list_shell_genes) ; do
cp genes/$gene/$gene.cdhit.fst shell/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr shell/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin shell/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq shell/$gene.fst.psq
done
popd 1>&2
}
function build_dust_libraries() {
pushd $SPDIR 1>&2
rm -rf dust
mkdir -p dust
for gene in $(list_dust_genes) ; do
cp genes/$gene/$gene.cdhit.fst dust/$gene.fst
cp genes/$gene/$gene.cdhit.fst.phr dust/$gene.fst.phr
cp genes/$gene/$gene.cdhit.fst.pin dust/$gene.fst.pin
cp genes/$gene/$gene.cdhit.fst.psq dust/$gene.fst.psq
done
popd 1>&2
}
function get_product_line() {
pushd $SPGENESDIR 1>&2
local gene=$1
local spac=$(head -1 $gene/$gene.cdhit.fst \
| awk '
{ AC=$2;
sub(/^SP_AC=/,"",AC);
sub(/;$/,"",AC);
print AC}')
popd 1>&2
pushd $SPDIR 1>&2
extract_chloro_gene_ac $spac rawdata/SP_Chloro.dat \
| grep "^DE " \
| $AwkCmd -v gene=$gene '
function remove_tails(line) {
sub(/ *(\{[^}]+\});/,"",line)
st = index(line,"=")
return substr(line,st+1)
}
/DE +RecName:/ {
full = remove_tails($0)
}
/DE +Short=/ {
ns = remove_tails($0)
if (length(ns) > length(short)) {
short = ns
}
}
/DE +EC=/ {
ec = remove_tails($0)
}
END {
if (length(short) > 10) {
product = short
} else {
product = full
}
if (ec != "") {
product = product " (EC:" ec ")"
}
if (product == "") {
product = "Hypothetical protein of unknown function"
}
gsub(/ /,"_",product)
print gene,gene,"--","--","--",product
}
'
popd 1>&2
}
function build_annotate_lst() {
pushd $SPGENESDIR 1>&2
for gene in * ; do
get_product_line $gene
done
popd 1>&2
}
pushd $SPDIR
####
#
# Download and prepare raw library from Swissprot FTP site
#
###
rm -rf rawdata
mkdir -p rawdata
pushd rawdata
download_swissprot | extract_chloro_entries > SP_Chloro.dat
extract_fasta_protein SP_Chloro.dat > SP_Chloro_gene_db.fst
popd
####
#
# Clean swiss-prot fasta file for gene name annotation
#
###
pushd rawdata
rename_genes SP_Chloro_gene_db.fst > SP_Chloro_gene_db.clean_name.fst
clean_strange_gene_name SP_Chloro_gene_db.clean_name.fst \
> SP_Chloro_gene_db.good_gene.fst
popd
####
#
# Prepare the database for all genes
#
###
rm -rf genes
split_by_gene rawdata/SP_Chloro_gene_db.good_gene.fst
dereplicate_genes
buildBlastDBs
####
#
# Prepare the differente gene databases for CDS annotation
#
###
build_core_libraries
build_shell_libraries
build_dust_libraries
build_rps12_library
####
#
# Build Annotation file
#
###
build_annotate_lst > Annot.lst
# ls ../../chlorodb/core/ | grep -v '\.' | sort > core_genes.lst
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
# join -a 1 -e xxxx core_genes.lst sp_genes.lst > join_core.lst
popd

View File

@ -1,106 +0,0 @@
#!/bin/bash
#
# BUILD REFERENCE : THE RPS12 LIBRARY
#
#========================================================================================
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/lib/clusterize_prot.sh"
function extract_rps12() {
$AwkCmd ' \
/^LOCUS/ {LOCUS=$2;} \
/^ [^ ]/ { if (CDS) { \
print LOCUS "/" feature; \
print "#################" \
} \
CDS=0; \
} \
/^ CDS / {CDS=1; \
$1=""; \
feature=""} \
(CDS) { sub(/^ */,"",$0); \
sub(/ *$/,"",$0); \
feature=feature $0} \
' \
| egrep -i '"rps12"' \
| $AwkCmd -F"/" ' \
function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \
} \
\
($1 != current) {current=$1; \
n=1 \
} \
{$1=$1 "_rps12_" n; \
n++; \
delete keys; \
for (i=3; i<=NF; i++) { \
split($i,key,"="); \
keys[key[1]]=key[2] \
} \
prot = keys["translation"]; \
gsub(/"/,"",prot); \
print ">" $1,"location=" $2 ";"; \
printfasta(prot) \
} \
'
}
pushTmpDir ORG.buildRPS12DB
RPS12FILE=RPS12_prot.fst
openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"
loginfo "Selecting Viridiplantae genbank entries..."
VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*)
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
loginfo "Done"
loginfo "Extracting the RPS12 protein sequences from the plants entries..."
( for gbk in ${VIRIDIPLANTAE} ; do
gzcat $gbk | \
extract_rps12
done ) > ${RPS12FILE}
loginfo "Done"
loginfo "Installing the RPS12 protein sequence database..."
cp ${RPS12FILE} "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"
loginfo "Done"
popTmpDir
pushd "${CDS_DATA_DIR}/chlorodb"
loginfo "Clusterizing the RPS12 protein sequence database..."
rm -rf RPS12_DB.clean.fst
clusterize RPS12_DB
loginfo "Done"
loginfo " formatting Blast RPS12 DB"
timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null
loginfo "Done"
popd
#
# format blast protein database
#
loginfo "Done"

View File

@ -1,8 +1,13 @@
#!/bin/bash
#
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -1,8 +1,13 @@
#!/bin/bash
#
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
gawk 'function printfasta(seq) { \
$AwkCmd 'function printfasta(seq) { \
seqlen=length(seq); \
for (i=1; i <= seqlen; i+=60) \
print substr(seq,i,60); \

View File

@ -1,5 +1,10 @@
#!/bin/bash
# -- CAUTION -- Works as long than the script
# is not called through a symlink
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
source "${THIS_DIR}/../../../scripts/bash_init.sh"
( \
for f in $* ; do \
@ -12,7 +17,7 @@
done \
) | \
grep -B 1 Viridiplantae | \
gawk '{print $1}' | \
$AwkCmd '{print $1}' | \
grep '\.gbk' | \
sed -E 's/(^.*\.gbk(.gz)?).$/\1/' | \
uniq