Switch to a swissprot based reference database for CDS annotation
Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
@ -6,10 +6,13 @@
|
||||
#
|
||||
# Annotate CDS using exonerate
|
||||
#
|
||||
# do_exonerate.sh <FASTAGENOM> <FASTAPROT> [<OUTDIR>]
|
||||
# do_exonerate.sh <PASS> <FASTAGENOM> <FASTAPROT> <ANNOTFILE> <MODELDIR> [<OUTDIR>]
|
||||
#
|
||||
# - <PASS> : the pass running exonarate
|
||||
# - <FASTAGENOM> : The fasta file containing the genome to annotate
|
||||
# - <FASTAPROT> : The fasta file containing the protein family
|
||||
# - <ANNOTFILE> : The annotation file used to add product info
|
||||
# - <MODELDIR> : Directory containing model parameters for exonerate
|
||||
#
|
||||
# Results are in file : `basename <FASTAGENOM>:r`.`basename <FASTAPROT>:r`.res
|
||||
#
|
||||
@ -26,17 +29,21 @@ alias Override 'if (-e \!:2) set \!:1 = \!:2'
|
||||
|
||||
NeedArg 2
|
||||
|
||||
set Pass = $Argv[1]; Shift
|
||||
set GenoFile = $Argv[1]; Shift
|
||||
set GenoName = `basename $GenoFile:r`
|
||||
|
||||
set ProtFile = $Argv[1]; Shift
|
||||
set ProtDir = `dirname $ProtFile`
|
||||
set DBDir = `dirname $ProtDir`
|
||||
set ProtName = `basename $ProtFile | $AwkCmd -F'.' '{print $1}'`
|
||||
set ProtType = `basename $ProtDir`
|
||||
|
||||
set AnnotFile = $Argv[1]; Shift
|
||||
|
||||
NeedFile $GenoFile
|
||||
NeedFile $ProtFile
|
||||
NeedFile $ProtDir/Annot.lst
|
||||
NeedFile $AnnotFile
|
||||
|
||||
set ModelsDir = $PROG_DIR/../models
|
||||
if ($#Argv > 0) then
|
||||
@ -188,7 +195,7 @@ if ( -z $base.exo.best) then
|
||||
$AwkCmd -v MAX_SPAN=$PASS1_MAX_SPAN \
|
||||
-v ALLOW_STOP=1 \
|
||||
-v EXCLUDE=$GenoName \
|
||||
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
|
||||
-f $LIB_DIR/bestclust.awk $base.exo.raw > $base.exo.best
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -196,7 +203,16 @@ endif
|
||||
# get annotations
|
||||
#
|
||||
|
||||
egrep "^$ProtName " $ProtDir/Annot.lst | $AwkCmd '{print "c annot", $0}' > T_$$
|
||||
set sp_match=`awk '/^e similarity/ {print $12}' $base.exo.best | head -1`
|
||||
|
||||
if ( ${%sp_match} > 0 ) then
|
||||
set sp_ac=`awk -v sp="$sp_match" '($1 ~ sp) {sub(/SP_AC=/,"",$2); sub(/;$/,"",$2); print $2} ' $DbFile`
|
||||
echo " " patch ac $sp_match to $sp_ac > /dev/stderr
|
||||
sed "s/$sp_match/$sp_ac/g" $base.exo.best > T_$$
|
||||
mv T_$$ $base.exo.best
|
||||
endif
|
||||
|
||||
egrep "^$ProtName " $AnnotFile | $AwkCmd '{print "c annot", $0}' > T_$$
|
||||
|
||||
#
|
||||
# extend start/stop
|
||||
@ -213,7 +229,7 @@ $AwkCmd -f $LIB_DIR/libutil.awk -f $LIB_DIR/extend.awk \
|
||||
# translate
|
||||
#
|
||||
|
||||
echo "c pass pass1 $ProtType" > $base.iff
|
||||
echo "c pass $Pass $ProtType" > $base.iff
|
||||
|
||||
$AwkCmd -v FASTA=$GenoFile -f $LIB_DIR/libutil.awk \
|
||||
-f $LIB_DIR/translate.awk T_$$ >> $base.iff
|
||||
|
@ -37,10 +37,13 @@ else
|
||||
TEMP=""
|
||||
fi
|
||||
|
||||
DBROOT="$CDS_DATA_DIR/chlorodb/RPS12"
|
||||
RPS12DB="${DBROOT}/RPS12_DB.clean.fst"
|
||||
DBROOT="$CDS_DATA_DIR/sp_chlorodb/RPS12"
|
||||
RPS12DB="${DBROOT}/rps12.fst"
|
||||
DELTA=50
|
||||
|
||||
AnnotFile="$CDS_DATA_DIR/sp_chlorodb/Annot.lst"
|
||||
ModelsDir="$CDS_DATA_DIR/sp_chlorodb/models"
|
||||
|
||||
SEQLEN=$(seqlength "${QUERY}")
|
||||
SEQUENCE=$(readfirstfastaseq "${QUERY}")
|
||||
|
||||
@ -61,7 +64,9 @@ blastx \
|
||||
BEGIN {BEST_EVAL = 1e-40;
|
||||
OUT = 0}
|
||||
/^#/ {next}
|
||||
($2 == PREV_CDS) { HSPs = HSPs "\n" $0;}
|
||||
($2 == PREV_CDS) && (($11 + 0.0) < (1e-5 + 0.0)) {
|
||||
HSPs = HSPs "\n" $0;
|
||||
}
|
||||
|
||||
(OUT < 20) && ($2 != PREV_CDS) && (BEST_EVAL < (1e-20 + 0.0)) {
|
||||
if (PREV_CDS) print HSPs;
|
||||
@ -75,6 +80,7 @@ blastx \
|
||||
(BEST_EVAL > ($11 + 0.0)) {BEST_EVAL = ($11 + 0.0)}
|
||||
' > "rps12_locate.hsps"
|
||||
|
||||
|
||||
#
|
||||
# Extracting protein ids from selected blast HSPs
|
||||
#
|
||||
@ -83,7 +89,6 @@ blastx \
|
||||
| sort \
|
||||
| uniq > "dbsel.txt"
|
||||
|
||||
|
||||
#
|
||||
# Extract corresponding protein sequences
|
||||
# from the RPS12 database.
|
||||
@ -134,7 +139,7 @@ blastx \
|
||||
}
|
||||
}
|
||||
' \
|
||||
| sort -nk 3 \
|
||||
| sort -nk 3 \
|
||||
| $AwkCmd '($3 != old3 || $4 != old4) {
|
||||
i++
|
||||
old3=$3
|
||||
@ -262,13 +267,14 @@ blastx \
|
||||
# It should be one or two fragments
|
||||
#
|
||||
export PASS1_SPEEDUP=0
|
||||
cp $DBROOT/Annot.lst RPS12
|
||||
nbseq = 0
|
||||
nbseq=0
|
||||
for fasta in rps12_fragments_*.fasta ; do
|
||||
tcsh -f ${PROG_DIR}/do_exonerate.csh \
|
||||
Pass2 \
|
||||
$fasta \
|
||||
"RPS12/rps12.fasta" \
|
||||
$DBROOT/../models $(pwd)
|
||||
$AnnotFile \
|
||||
$ModelsDir $(pwd)
|
||||
((nbseq=nbseq+1))
|
||||
done
|
||||
|
||||
|
@ -35,19 +35,22 @@ Genome=$(basename ${Fasta%.*})
|
||||
# DbRoot is set to its default values except
|
||||
# if the second argument precise another DbRoot
|
||||
|
||||
DbRoot="$CDS_DATA_DIR/chlorodb"
|
||||
DbRoot="$CDS_DATA_DIR/sp_chlorodb"
|
||||
|
||||
if (( $# > 0)) ; then
|
||||
DbRoot="$1"; Shift
|
||||
fi
|
||||
|
||||
AnnotFile="$DbRoot/Annot.lst"
|
||||
|
||||
needdir $DbRoot
|
||||
needdir $DbRoot/core
|
||||
needfile $DbRoot/core/Annot.lst
|
||||
needfile $AnnotFile
|
||||
needdir $DbRoot/models
|
||||
|
||||
assignundef cdsdetection_pass1 yes
|
||||
assignundef cdsdetection_pass2 yes
|
||||
assignundef cdsdetection_pass3 yes
|
||||
|
||||
temp=$(mktempdir $(hostname))
|
||||
|
||||
@ -63,33 +66,47 @@ Fasta="$temp/genome.fasta"
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass1" == "yes" ]] ; then
|
||||
for dir in "core" "shell" "dust" ; do
|
||||
for dir in "core" ; do
|
||||
if [[ -d $DbRoot/$dir ]] ; then
|
||||
fams=$(ls $DbRoot/$dir/*.clean.fst)
|
||||
fams=$(ls $DbRoot/$dir/*.fst)
|
||||
loginfo "running pass1:$dir exonerate of $Genome on $DbRoot"
|
||||
for f in $fams ; do
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh $Fasta $f $DbRoot/models $temp
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh Pass1 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
done
|
||||
fi
|
||||
done
|
||||
|
||||
cp $temp/genome.cds.fasta $Genome.cds.fasta
|
||||
|
||||
mv $temp/genome.cds.fasta $Genome.cds_pass1.fasta
|
||||
fi
|
||||
|
||||
|
||||
#
|
||||
# pass2: transsplicing
|
||||
# pass2: RPS12 gene with transsplicing
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass2" == "yes" ]] ; then
|
||||
loginfo "running pass2:rps12 exonerate of $Genome on $DbRoot"
|
||||
$PROG_DIR/do_rps12.sh $Fasta $temp
|
||||
fi
|
||||
|
||||
#
|
||||
# pass3: prokov
|
||||
# pass3: run exonerate on shell and dust
|
||||
#
|
||||
|
||||
if [[ "$cdsdetection_pass3" == "yes" ]] ; then
|
||||
for dir in "shell" ; do
|
||||
if [[ -d $DbRoot/$dir ]] ; then
|
||||
fams=$(ls $DbRoot/$dir/*.fst)
|
||||
loginfo $fams
|
||||
loginfo "running pass3:$dir exonerate of $Genome on $DbRoot"
|
||||
for f in $fams ; do
|
||||
tcsh -f $PROG_DIR/do_exonerate.csh Pass3 $Fasta $f $AnnotFile $DbRoot/models $temp
|
||||
done
|
||||
fi
|
||||
done
|
||||
mv $temp/genome.cds.fasta $Genome.cds_pass2.fasta
|
||||
fi
|
||||
|
||||
# $PROG_DIR/do_prokov.sh $Fasta $Genome.cds.fasta $temp
|
||||
|
||||
#
|
||||
|
@ -142,8 +142,7 @@ function Unk(s) {
|
||||
}
|
||||
|
||||
/^e similarity/ {
|
||||
split($12, a, "@")
|
||||
Simil = a[1] ":" a[2]
|
||||
Simil = a[1] "UniProtKB/Swiss-Prot:" $12
|
||||
next
|
||||
}
|
||||
|
||||
@ -179,7 +178,7 @@ function Unk(s) {
|
||||
QQualifier("note","nonfunctional due to a frameshift")
|
||||
}
|
||||
QQualifier("product", Product)
|
||||
QQualifier("inference", "similar to DNA sequence:" Simil)
|
||||
QQualifier("inference", "similar to " Simil)
|
||||
# QQualifier("inference", "org.annot -- detect pass:" PassType ":" PassInfo)
|
||||
if (FrameShift==0) {
|
||||
if (match(Translat,/\*/)>0) {
|
||||
|
542
detectors/cds/tools/build_swissprot_db.sh
Normal file
542
detectors/cds/tools/build_swissprot_db.sh
Normal file
@ -0,0 +1,542 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD REFERENCE : From SwissProt
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
|
||||
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
SPDIR="$CDS_DATA_DIR/sp_chlorodb"
|
||||
SPGENESDIR="$SPDIR/genes"
|
||||
|
||||
CORE_GENES="accD"
|
||||
CORE_GENES="$CORE_GENES atpA atpB atpE atpF atpH atpI"
|
||||
CORE_GENES="$CORE_GENES ccsA"
|
||||
CORE_GENES="$CORE_GENES cemA"
|
||||
CORE_GENES="$CORE_GENES clpP"
|
||||
CORE_GENES="$CORE_GENES infA"
|
||||
CORE_GENES="$CORE_GENES matK"
|
||||
CORE_GENES="$CORE_GENES ndhA ndhB ndhC ndhD ndhE ndhF ndhG ndhH ndhI ndhJ ndhK"
|
||||
CORE_GENES="$CORE_GENES petA petB petD petG petL petN"
|
||||
CORE_GENES="$CORE_GENES psaA psaB psaC psaI psaJ psbA"
|
||||
CORE_GENES="$CORE_GENES psbB psbC psbD psbE psbF psbH psbI psbJ psbK psbL psbM psbN psbT psbZ"
|
||||
CORE_GENES="$CORE_GENES rbcL"
|
||||
CORE_GENES="$CORE_GENES rpl14 rpl16 rpl2 rpl20 rpl22 rpl23 rpl32 rpl33 rpl36"
|
||||
CORE_GENES="$CORE_GENES rpoA rpoB rpoC1 rpoC2"
|
||||
CORE_GENES="$CORE_GENES rps2 rps3 rps4 rps7 rps8 rps11 rps14 rps15 rps16 rps18 rps19"
|
||||
CORE_GENES="$CORE_GENES ycf1 ycf2 ycf3 ycf4"
|
||||
|
||||
RPS12_GENE="rps12"
|
||||
|
||||
function download_swissprot() {
|
||||
URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz"
|
||||
|
||||
curl $URL
|
||||
}
|
||||
|
||||
function extract_chloro_entries() {
|
||||
awk -F'\n' '
|
||||
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
|
||||
/OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ && !/DE Flags: Fragment;/ {print $0}
|
||||
' $1
|
||||
}
|
||||
|
||||
function extract_chloro_gene_entries() {
|
||||
|
||||
awk -v gene=$1 -F'\n' '
|
||||
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
|
||||
($0 ~ gene"_") && /OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ {print $0}
|
||||
' $2
|
||||
}
|
||||
|
||||
function extract_chloro_gene_frg() {
|
||||
|
||||
awk -F'\n' '
|
||||
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
|
||||
/DE Flags: Fragment;/ {print $0}
|
||||
' $1
|
||||
}
|
||||
|
||||
function extract_chloro_gene_ac() {
|
||||
|
||||
awk -v ac=$1 -F'\n' '
|
||||
BEGIN {RS="//\n"; ORS=RS; OFS="\n"}
|
||||
($0 ~ "AC " ac ";") && /OC Eukaryota; Viridiplantae; Streptophyta;/ && /OG Plastid; Chloroplast./ {print $0}
|
||||
' $2
|
||||
}
|
||||
|
||||
function extract_fasta_protein() {
|
||||
$AwkCmd '
|
||||
/^ID/ {ID=$2}
|
||||
/^AC/ {AC=$2; sub(";","",AC)}
|
||||
/^DR EMBL;/ {
|
||||
EMBL=$4;
|
||||
sub(";","",EMBL);
|
||||
sub("-","xxx",EMBL);
|
||||
if ( EMBL != "xxx" ) {
|
||||
EBI="curl \"https://www.ebi.ac.uk/ena/browser/api/embl/" EMBL "?download=true\" | egrep \"FT +/gene=\" | cut -d \"=\" -f 2 | tr -d \"\\\"\""
|
||||
EBI | getline GENE
|
||||
close(EBI)
|
||||
} else {
|
||||
GENE = "xxx"
|
||||
}
|
||||
}
|
||||
/^ / {gsub(/ /,"",$0); SEQ=SEQ $0}
|
||||
/^\/\// {
|
||||
if (GENE != "xxx") {
|
||||
print ">"ID,"SP_AC="AC";","EMBL_AC="EMBL";","gene="GENE";";
|
||||
print SEQ
|
||||
}
|
||||
ID="xxx";
|
||||
AC="xxx";
|
||||
EMBL="xxx";
|
||||
GENE="xxx"
|
||||
SEQ=""
|
||||
}
|
||||
' $1 \
|
||||
| formatfasta
|
||||
}
|
||||
|
||||
function rename_rules() {
|
||||
egrep "^>" $1 \
|
||||
| sed 's/^>//' \
|
||||
| sed 's/_/=/' \
|
||||
| tr -d ";" \
|
||||
| awk -F"=" '{print $1,$NF}' \
|
||||
| sort \
|
||||
| uniq -c \
|
||||
| awk '{
|
||||
x=$1
|
||||
$1=$2
|
||||
$2=sprintf("%06d",x)
|
||||
print $0}' \
|
||||
| sort -rn \
|
||||
| sed 's/ /=/' \
|
||||
| sed 's/ /=/' \
|
||||
| awk -F "=" '
|
||||
($1 == current) {
|
||||
sub(/^0+/,"",occurrence)
|
||||
print "# based on",occurrence,"observations renames",$NF,"in",gene
|
||||
print "/^>" current "/ s@gene=" $NF "@gene=" gene "@"
|
||||
}
|
||||
($1 != current) {
|
||||
current= $1
|
||||
gene=$NF
|
||||
occurrence=$2
|
||||
}
|
||||
'
|
||||
}
|
||||
|
||||
function rename_genes() {
|
||||
local n=1
|
||||
local input=$1
|
||||
cat $input > __tmp__$$__fasta
|
||||
rules=${input/fst/rules}
|
||||
#echo $rule 1>&2
|
||||
|
||||
rename_rules __tmp__$$__fasta > $rules.$n
|
||||
while [[ -s $rules.$n ]] ; do
|
||||
sed -f $rules.$n __tmp__$$__fasta > __tmp2__$$__fasta
|
||||
mv __tmp2__$$__fasta __tmp__$$__fasta
|
||||
((n++))
|
||||
rename_rules __tmp__$$__fasta > $rules.$n
|
||||
done
|
||||
|
||||
cat __tmp__$$__fasta
|
||||
rm __tmp__$$__fasta
|
||||
}
|
||||
|
||||
|
||||
function clean_strange_gene_name() {
|
||||
local input=$1
|
||||
local output=${input/.fst/_sp_ebi.genes}
|
||||
local to_keep=${input/.fst/_to_keep.lst}
|
||||
local to_be_removed=${input/.fst/_to_be_removed.lst}
|
||||
|
||||
grep '^>' $input \
|
||||
| sed 's/^>//' \
|
||||
| sed 's/_/=/' \
|
||||
| sed 's/;$//' \
|
||||
| awk -F'=' '{print $NF,$1}' \
|
||||
| sort \
|
||||
| uniq -c \
|
||||
| $AwkCmd '{
|
||||
x=$1
|
||||
$1=$2
|
||||
$2=sprintf("%06d",x)
|
||||
print $0}' \
|
||||
| sort -r > $output
|
||||
|
||||
$AwkCmd '
|
||||
($1!=current) {current=$1; print $3}
|
||||
' $output > $to_keep
|
||||
|
||||
$AwkCmd '
|
||||
($1==current) {print $3}
|
||||
($1!=current) {current=$1}
|
||||
' $output > $to_be_removed
|
||||
|
||||
filter_sp_fasta_db $to_be_removed $input > ${input/.fst/_strange_genes.fst}
|
||||
filter_sp_fasta_db $to_keep $input
|
||||
}
|
||||
|
||||
function filter_sp_fasta_db() {
|
||||
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
|
||||
|
||||
$AwkCmd -F "_" -v gene_pattern=$gene_pattern '
|
||||
/^>/ && (gene ~ gene_pattern) {
|
||||
print entry
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
entry = $0
|
||||
gene = $1
|
||||
sub(/^>/, "", gene)
|
||||
}
|
||||
|
||||
!/^>/ {
|
||||
entry = entry "\n" $0
|
||||
}
|
||||
|
||||
END {
|
||||
if (gene ~ gene_pattern)
|
||||
print entry
|
||||
}
|
||||
' $2
|
||||
|
||||
}
|
||||
|
||||
function filter_out_sp_fasta_db() {
|
||||
gene_pattern="$(echo $(cat $1) | tr ' ' '|')"
|
||||
|
||||
$AwkCmd -v gene_pattern=$gene_pattern '
|
||||
/^>/ && (gene !~ gene_pattern) {
|
||||
print entry
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
entry = $0
|
||||
gene = $1
|
||||
sub(/^>/, "", gene)
|
||||
}
|
||||
|
||||
!/^>/ {
|
||||
entry = entry "\n" $0
|
||||
}
|
||||
|
||||
END {
|
||||
if (gene !~ gene_pattern)
|
||||
print entry
|
||||
}
|
||||
' $2
|
||||
|
||||
}
|
||||
|
||||
function split_by_gene() {
|
||||
$AwkCmd -F "=" -v SPGENES=$SPGENESDIR '
|
||||
function writegene(gene,entry) {
|
||||
outputdir = SPGENES "/" gene
|
||||
output = outputdir "/" gene ".fst"
|
||||
system("mkdir -p " outputdir)
|
||||
print entry >> output
|
||||
close(output)
|
||||
}
|
||||
|
||||
/^>/ && gene!="" {
|
||||
writegene(gene,entry)
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
entry = $0
|
||||
gene = $NF
|
||||
sub(/;$/, "", gene)
|
||||
}
|
||||
|
||||
!/^>/ {
|
||||
entry = entry "\n" $0
|
||||
}
|
||||
|
||||
END {
|
||||
writegene(gene,entry)
|
||||
}
|
||||
' $1
|
||||
}
|
||||
|
||||
function dereplicate() {
|
||||
local CDHIT_ID=0.95
|
||||
local CDHIT_DELTA=0.95
|
||||
|
||||
local gene="${1}"
|
||||
local fastain="${gene}/${gene}.fst"
|
||||
local cdhitout="${gene}/${gene}.cdhit.fst"
|
||||
|
||||
cd-hit -i "${fastain}" \
|
||||
-o "${cdhitout}" \
|
||||
-c ${CDHIT_DELTA} \
|
||||
-G 1 \
|
||||
-g 1 \
|
||||
-aL 0.95 \
|
||||
-s ${CDHIT_ID} \
|
||||
-b 350 -p 1 \
|
||||
-d 0 -n 3
|
||||
|
||||
local fasta1="${gene}/${gene}.1l.fst"
|
||||
}
|
||||
|
||||
function dereplicate_genes() {
|
||||
pushd $SPGENESDIR
|
||||
for g in * ; do
|
||||
dereplicate $g ;
|
||||
done
|
||||
popd
|
||||
}
|
||||
|
||||
function buildGeneBlastDB() {
|
||||
local gene="${1}"
|
||||
local fastain="${gene}/${gene}.cdhit.fst"
|
||||
|
||||
loginfo " formatting Blast $gene DB"
|
||||
timeoutcmd 300 makeblastdb -dbtype prot -in ${fastain} >& /dev/null
|
||||
loginfo "Done"
|
||||
}
|
||||
|
||||
function buildBlastDBs() {
|
||||
pushd $SPGENESDIR
|
||||
for g in * ; do
|
||||
buildGeneBlastDB $g ;
|
||||
done
|
||||
popd
|
||||
}
|
||||
|
||||
function list_shell_genes() {
|
||||
pushd $SPDIR
|
||||
|
||||
ls genes \
|
||||
| grep -v '\.' \
|
||||
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
|
||||
| grep -iv '^orf' \
|
||||
| grep -iv "$RPS12_GENE"
|
||||
|
||||
popd
|
||||
}
|
||||
|
||||
function list_dust_genes() {
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
ls genes \
|
||||
| grep -v '\.' \
|
||||
| egrep -iv $(tr " " "|" <<< $CORE_GENES) \
|
||||
| grep -i '^orf'
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function build_core_libraries() {
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
rm -rf core
|
||||
mkdir -p core
|
||||
|
||||
for gene in $CORE_GENES ; do
|
||||
cp genes/$gene/$gene.cdhit.fst core/$gene.fst
|
||||
cp genes/$gene/$gene.cdhit.fst.phr core/$gene.fst.phr
|
||||
cp genes/$gene/$gene.cdhit.fst.pin core/$gene.fst.pin
|
||||
cp genes/$gene/$gene.cdhit.fst.psq core/$gene.fst.psq
|
||||
done
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function build_rps12_library() {
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
local gene=$RPS12_GENE
|
||||
|
||||
rm -rf RPS12
|
||||
mkdir -p RPS12
|
||||
|
||||
cp genes/$gene/$gene.cdhit.fst RPS12/$gene.fst
|
||||
cp genes/$gene/$gene.cdhit.fst.phr RPS12/$gene.fst.phr
|
||||
cp genes/$gene/$gene.cdhit.fst.pin RPS12/$gene.fst.pin
|
||||
cp genes/$gene/$gene.cdhit.fst.psq RPS12/$gene.fst.psq
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function build_shell_libraries() {
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
rm -rf shell
|
||||
mkdir -p shell
|
||||
|
||||
for gene in $(list_shell_genes) ; do
|
||||
cp genes/$gene/$gene.cdhit.fst shell/$gene.fst
|
||||
cp genes/$gene/$gene.cdhit.fst.phr shell/$gene.fst.phr
|
||||
cp genes/$gene/$gene.cdhit.fst.pin shell/$gene.fst.pin
|
||||
cp genes/$gene/$gene.cdhit.fst.psq shell/$gene.fst.psq
|
||||
done
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function build_dust_libraries() {
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
rm -rf dust
|
||||
mkdir -p dust
|
||||
|
||||
for gene in $(list_dust_genes) ; do
|
||||
cp genes/$gene/$gene.cdhit.fst dust/$gene.fst
|
||||
cp genes/$gene/$gene.cdhit.fst.phr dust/$gene.fst.phr
|
||||
cp genes/$gene/$gene.cdhit.fst.pin dust/$gene.fst.pin
|
||||
cp genes/$gene/$gene.cdhit.fst.psq dust/$gene.fst.psq
|
||||
done
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function get_product_line() {
|
||||
pushd $SPGENESDIR 1>&2
|
||||
|
||||
local gene=$1
|
||||
local spac=$(head -1 $gene/$gene.cdhit.fst \
|
||||
| awk '
|
||||
{ AC=$2;
|
||||
sub(/^SP_AC=/,"",AC);
|
||||
sub(/;$/,"",AC);
|
||||
print AC}')
|
||||
|
||||
popd 1>&2
|
||||
|
||||
pushd $SPDIR 1>&2
|
||||
|
||||
extract_chloro_gene_ac $spac rawdata/SP_Chloro.dat \
|
||||
| grep "^DE " \
|
||||
| $AwkCmd -v gene=$gene '
|
||||
function remove_tails(line) {
|
||||
sub(/ *(\{[^}]+\});/,"",line)
|
||||
st = index(line,"=")
|
||||
return substr(line,st+1)
|
||||
}
|
||||
|
||||
/DE +RecName:/ {
|
||||
full = remove_tails($0)
|
||||
}
|
||||
/DE +Short=/ {
|
||||
ns = remove_tails($0)
|
||||
if (length(ns) > length(short)) {
|
||||
short = ns
|
||||
}
|
||||
}
|
||||
/DE +EC=/ {
|
||||
ec = remove_tails($0)
|
||||
}
|
||||
|
||||
END {
|
||||
if (length(short) > 10) {
|
||||
product = short
|
||||
} else {
|
||||
product = full
|
||||
}
|
||||
|
||||
if (ec != "") {
|
||||
product = product " (EC:" ec ")"
|
||||
}
|
||||
|
||||
if (product == "") {
|
||||
product = "Hypothetical protein of unknown function"
|
||||
}
|
||||
|
||||
gsub(/ /,"_",product)
|
||||
|
||||
print gene,gene,"--","--","--",product
|
||||
}
|
||||
'
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
function build_annotate_lst() {
|
||||
pushd $SPGENESDIR 1>&2
|
||||
|
||||
for gene in * ; do
|
||||
get_product_line $gene
|
||||
done
|
||||
|
||||
popd 1>&2
|
||||
}
|
||||
|
||||
pushd $SPDIR
|
||||
|
||||
####
|
||||
#
|
||||
# Download and prepare raw library from Swissprot FTP site
|
||||
#
|
||||
###
|
||||
|
||||
rm -rf rawdata
|
||||
mkdir -p rawdata
|
||||
|
||||
pushd rawdata
|
||||
|
||||
download_swissprot | extract_chloro_entries > SP_Chloro.dat
|
||||
|
||||
extract_fasta_protein SP_Chloro.dat > SP_Chloro_gene_db.fst
|
||||
|
||||
popd
|
||||
|
||||
####
|
||||
#
|
||||
# Clean swiss-prot fasta file for gene name annotation
|
||||
#
|
||||
###
|
||||
|
||||
pushd rawdata
|
||||
|
||||
rename_genes SP_Chloro_gene_db.fst > SP_Chloro_gene_db.clean_name.fst
|
||||
|
||||
clean_strange_gene_name SP_Chloro_gene_db.clean_name.fst \
|
||||
> SP_Chloro_gene_db.good_gene.fst
|
||||
|
||||
popd
|
||||
|
||||
####
|
||||
#
|
||||
# Prepare the database for all genes
|
||||
#
|
||||
###
|
||||
|
||||
rm -rf genes
|
||||
split_by_gene rawdata/SP_Chloro_gene_db.good_gene.fst
|
||||
|
||||
dereplicate_genes
|
||||
|
||||
buildBlastDBs
|
||||
|
||||
####
|
||||
#
|
||||
# Prepare the differente gene databases for CDS annotation
|
||||
#
|
||||
###
|
||||
|
||||
build_core_libraries
|
||||
build_shell_libraries
|
||||
build_dust_libraries
|
||||
build_rps12_library
|
||||
|
||||
####
|
||||
#
|
||||
# Build Annotation file
|
||||
#
|
||||
###
|
||||
|
||||
build_annotate_lst > Annot.lst
|
||||
|
||||
# ls ../../chlorodb/core/ | grep -v '\.' | sort > core_genes.lst
|
||||
# ls | grep -v '\.' | awk '{print tolower($1),$1}' | sort > sp_genes.lst
|
||||
# join -a 1 -e xxxx core_genes.lst sp_genes.lst > join_core.lst
|
||||
|
||||
popd
|
@ -1,106 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# BUILD REFERENCE : THE RPS12 LIBRARY
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
|
||||
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
source "${THIS_DIR}/lib/clusterize_prot.sh"
|
||||
|
||||
function extract_rps12() {
|
||||
$AwkCmd ' \
|
||||
/^LOCUS/ {LOCUS=$2;} \
|
||||
/^ [^ ]/ { if (CDS) { \
|
||||
print LOCUS "/" feature; \
|
||||
print "#################" \
|
||||
} \
|
||||
CDS=0; \
|
||||
} \
|
||||
/^ CDS / {CDS=1; \
|
||||
$1=""; \
|
||||
feature=""} \
|
||||
(CDS) { sub(/^ */,"",$0); \
|
||||
sub(/ *$/,"",$0); \
|
||||
feature=feature $0} \
|
||||
' \
|
||||
| egrep -i '"rps12"' \
|
||||
| $AwkCmd -F"/" ' \
|
||||
function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
} \
|
||||
\
|
||||
($1 != current) {current=$1; \
|
||||
n=1 \
|
||||
} \
|
||||
{$1=$1 "_rps12_" n; \
|
||||
n++; \
|
||||
delete keys; \
|
||||
for (i=3; i<=NF; i++) { \
|
||||
split($i,key,"="); \
|
||||
keys[key[1]]=key[2] \
|
||||
} \
|
||||
prot = keys["translation"]; \
|
||||
gsub(/"/,"",prot); \
|
||||
print ">" $1,"location=" $2 ";"; \
|
||||
printfasta(prot) \
|
||||
} \
|
||||
'
|
||||
}
|
||||
|
||||
|
||||
pushTmpDir ORG.buildRPS12DB
|
||||
|
||||
RPS12FILE=RPS12_prot.fst
|
||||
|
||||
openLogFile "${CDS_DATA_DIR}/chlorodb/RPS12_DB.log"
|
||||
|
||||
loginfo "Selecting Viridiplantae genbank entries..."
|
||||
VIRIDIPLANTAE=$(${PROG_DIR}/../../normalize/tools/selectViridiplantae.sh $*)
|
||||
loginfo " --> $(echo ${VIRIDIPLANTAE} | wc -w) entries selected"
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Extracting the RPS12 protein sequences from the plants entries..."
|
||||
( for gbk in ${VIRIDIPLANTAE} ; do
|
||||
gzcat $gbk | \
|
||||
extract_rps12
|
||||
done ) > ${RPS12FILE}
|
||||
loginfo "Done"
|
||||
|
||||
loginfo "Installing the RPS12 protein sequence database..."
|
||||
|
||||
cp ${RPS12FILE} "${CDS_DATA_DIR}/chlorodb/RPS12_DB.fst"
|
||||
|
||||
loginfo "Done"
|
||||
|
||||
popTmpDir
|
||||
|
||||
pushd "${CDS_DATA_DIR}/chlorodb"
|
||||
|
||||
loginfo "Clusterizing the RPS12 protein sequence database..."
|
||||
rm -rf RPS12_DB.clean.fst
|
||||
clusterize RPS12_DB
|
||||
loginfo "Done"
|
||||
|
||||
loginfo " formatting Blast RPS12 DB"
|
||||
timeoutcmd 300 makeblastdb -dbtype prot -in RPS12_DB.clean.fst >& /dev/null
|
||||
loginfo "Done"
|
||||
|
||||
|
||||
|
||||
popd
|
||||
|
||||
#
|
||||
# format blast protein database
|
||||
#
|
||||
|
||||
|
||||
|
||||
|
||||
loginfo "Done"
|
||||
|
@ -1,8 +1,13 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
gawk 'function printfasta(seq) { \
|
||||
|
||||
$AwkCmd 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
|
@ -1,8 +1,13 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
gawk 'function printfasta(seq) { \
|
||||
|
||||
$AwkCmd 'function printfasta(seq) { \
|
||||
seqlen=length(seq); \
|
||||
for (i=1; i <= seqlen; i+=60) \
|
||||
print substr(seq,i,60); \
|
||||
|
@ -1,5 +1,10 @@
|
||||
#!/bin/bash
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
|
||||
( \
|
||||
for f in $* ; do \
|
||||
@ -12,7 +17,7 @@
|
||||
done \
|
||||
) | \
|
||||
grep -B 1 Viridiplantae | \
|
||||
gawk '{print $1}' | \
|
||||
$AwkCmd '{print $1}' | \
|
||||
grep '\.gbk' | \
|
||||
sed -E 's/(^.*\.gbk(.gz)?).$/\1/' | \
|
||||
uniq
|
Reference in New Issue
Block a user