Simplify CAU tRNA reference database building to keep onlyCAU tRNA
from plastomes where the three categories of CAU tRNA (Met/Ile/fMet) are annotated Former-commit-id: 67dc445698e22fe8a503c6700977c79e4817d302 Former-commit-id: 6e84303543b0752a7946bdde6e5114cfe6eef8da
This commit is contained in:
@ -13,55 +13,36 @@ function fasta1li {
|
|||||||
|
|
||||||
$AwkCmd '/^>/ {if (sequence) \
|
$AwkCmd '/^>/ {if (sequence) \
|
||||||
{print sequence}; \
|
{print sequence}; \
|
||||||
print $0; \
|
print $0; \
|
||||||
sequence=""} \
|
sequence=""} \
|
||||||
!/^>/ {sequence = sequence $0} \
|
!/^>/ {sequence = sequence $0} \
|
||||||
END {print sequence}' $1
|
END {print sequence}' $1
|
||||||
}
|
}
|
||||||
|
|
||||||
function dereplicate {
|
|
||||||
DATA=$1
|
|
||||||
sumaclust -t 1 $DATA | \
|
|
||||||
fasta1li | \
|
|
||||||
grep -A 1 '^>' | \
|
|
||||||
grep -A1 'cluster_center=True;' | \
|
|
||||||
grep -v -- -- | \
|
|
||||||
sed -E "s/count=[0-9]+; //" | \
|
|
||||||
sed 's/cluster_weight/count/' | \
|
|
||||||
$AwkCmd ' /^>/ {SEQ++;$1=$1"_"SEQ;print $0} \
|
|
||||||
!/^>/ {print $0}'
|
|
||||||
}
|
|
||||||
|
|
||||||
function extractSeqs {
|
function filtertrna() {
|
||||||
rm -f $$.index
|
|
||||||
fastaindex -f $1 \
|
|
||||||
-i $$.index
|
|
||||||
|
|
||||||
for id in `cat $2`; do
|
$AwkCmd -F '_' 'BEGIN {RS=">"} \
|
||||||
fastafetch -f $1 \
|
(! /^$/) {trna=$1; \
|
||||||
-i $$.index \
|
ac=$2"_"$3;} \
|
||||||
-q $id
|
(ac!=oldac && \
|
||||||
done
|
trnas["trnfM"]==1 && \
|
||||||
|
trnas["trnM"]==1 && \
|
||||||
rm -f $$.index
|
trnas["trnI"]==1 \
|
||||||
}
|
) {print seqs} \
|
||||||
|
(ac!=oldac) {trnas["trnfM"]=0; \
|
||||||
function goodtrna {
|
trnas["trnM"]=0; \
|
||||||
local QUERY=$1
|
trnas["trnI"]=0; \
|
||||||
local REF=$2
|
seqs=""; \
|
||||||
sumatra -t 0.90 -x $QUERY $REF | \
|
oldac=ac \
|
||||||
sed -E 's/.(trn.M?)[_A-Z0-9]+/ \1 /' | \
|
} \
|
||||||
sort -k 1,2 | \
|
(! /^$/) {seqs=seqs"\n>"$0; \
|
||||||
$AwkCmd '(OLD) && ($1!=OLD) {print OLD,c["trnM"],c["trnfM"],c["trnI"]} \
|
trnas[trna]=1;} \
|
||||||
(OLD !=$1) {c["trnM"]=0;c["trnfM"]=0;c["trnI"]=0;OLD=$1} \
|
END {if (trnas["trnfM"]==1 && \
|
||||||
{c[$2]+=$5}' | \
|
trnas["trnM"]==1 && \
|
||||||
$AwkCmd '{p=0;} \
|
trnas["trnI"]==1) \
|
||||||
($2 > $3) && ($2 > $4) { print $0,"trnM";p=1 } \
|
print seqs}' $1 | \
|
||||||
($3 > $2) && ($3 > $4) {print $0,"trnfM";p=1} \
|
egrep -v "^ *$"
|
||||||
($4 > $2) && ($4 > $3) {print $0,"trnI";p=1} \
|
|
||||||
(p==0) {print $0,"----"}' | sed 's/_/ /' | \
|
|
||||||
$AwkCmd '{print $1"_"$2,$3,$4,$5,$1,$6}' | \
|
|
||||||
$AwkCmd '(($2+$3+$4) > 1) && ($5==$6) {print $1}'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pushTmpDir ORG.buildSCDB
|
pushTmpDir ORG.buildSCDB
|
||||||
@ -79,46 +60,13 @@ pushTmpDir ORG.buildSCDB
|
|||||||
${PROG_DIR}/extract_refCAUtRNA.sh ${VIRIDIPLANTAE} | \
|
${PROG_DIR}/extract_refCAUtRNA.sh ${VIRIDIPLANTAE} | \
|
||||||
fasta1li | \
|
fasta1li | \
|
||||||
egrep -A 1 '^>trn(I|M|fM)' | \
|
egrep -A 1 '^>trn(I|M|fM)' | \
|
||||||
grep -v -- -- > ${CAUFILE}
|
grep -v -- -- | \
|
||||||
loginfo "Done"
|
filtertrna > ${CAUFILE}
|
||||||
|
|
||||||
loginfo "Sorting the CAU tRNA..."
|
|
||||||
|
|
||||||
loginfo "Extract and dereplicate trnI..."
|
|
||||||
egrep -A 1 '^>trnI_' ${CAUFILE} | grep -v -- -- > trnI.fasta
|
|
||||||
dereplicate trnI.fasta > trnCAU.fasta
|
|
||||||
|
|
||||||
loginfo "Extract and dereplicate trnM..."
|
|
||||||
egrep -A 1 '^>trnM_' ${CAUFILE} | grep -v -- -- > trnM.fasta
|
|
||||||
dereplicate trnM.fasta >> trnCAU.fasta
|
|
||||||
|
|
||||||
loginfo "Extract and dereplicate trnfM..."
|
|
||||||
egrep -A 1 '^>trnfM_' ${CAUFILE} | grep -v -- -- > trnfM.fasta
|
|
||||||
dereplicate trnfM.fasta >> trnCAU.fasta
|
|
||||||
|
|
||||||
loginfo "Done"
|
|
||||||
|
|
||||||
loginfo "Cleaning the CAU tRNA..."
|
|
||||||
|
|
||||||
loginfo "First pass..."
|
|
||||||
|
|
||||||
goodtrna trnCAU.fasta trnCAU.fasta > trnCAU.good.ids
|
|
||||||
extractSeqs trnCAU.fasta trnCAU.good.ids > trnCAU.good.fasta
|
|
||||||
|
|
||||||
loginfo " --> $(wc -l trnCAU.good.ids) sequences kept"
|
|
||||||
|
|
||||||
loginfo "Second pass..."
|
|
||||||
|
|
||||||
goodtrna trnCAU.fasta trnCAU.good.fasta > trnCAU.good.ids
|
|
||||||
extractSeqs trnCAU.fasta trnCAU.good.ids > trnCAU.good.fasta
|
|
||||||
|
|
||||||
loginfo " --> $(wc -l trnCAU.good.ids) sequences kept"
|
|
||||||
|
|
||||||
loginfo "Done"
|
loginfo "Done"
|
||||||
|
|
||||||
loginfo "Installing the CAU tRNA database..."
|
loginfo "Installing the CAU tRNA database..."
|
||||||
|
|
||||||
cp trnCAU.good.fasta "${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
|
cp ${CAUFILE} "${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
|
||||||
|
|
||||||
loginfo "Done"
|
loginfo "Done"
|
||||||
|
|
||||||
|
@ -10,15 +10,39 @@ THIS_DIR="$(dirname ${BASH_SOURCE[0]})"
|
|||||||
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
source "${THIS_DIR}/../../../scripts/bash_init.sh"
|
||||||
|
|
||||||
function taxid {
|
function taxid {
|
||||||
egrep '/db_xref="taxon:[0-9]+"' $1 | \
|
local gbk=$1
|
||||||
|
local CAT=cat
|
||||||
|
|
||||||
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
|
$CAT $gbk | \
|
||||||
|
egrep '/db_xref="taxon:[0-9]+"' | \
|
||||||
sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@'
|
sed -E 's@ +/db_xref="taxon:([0-9]+)"@\1@'
|
||||||
}
|
}
|
||||||
|
|
||||||
function ac {
|
function ac {
|
||||||
head -1 $1 | $AwkCmd '{print $2}'
|
local gbk=$1
|
||||||
|
local CAT=cat
|
||||||
|
|
||||||
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
|
$CAT $gbk | \
|
||||||
|
head -1 | $AwkCmd '{print $2}'
|
||||||
}
|
}
|
||||||
|
|
||||||
function definition {
|
function definition {
|
||||||
|
local gbk=$1
|
||||||
|
local CAT=cat
|
||||||
|
|
||||||
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
|
$CAT $gbk | \
|
||||||
$AwkCmd '/^DEFINITION/ {on=1} \
|
$AwkCmd '/^DEFINITION/ {on=1} \
|
||||||
(on==1) {printf("%s ",$0)} \
|
(on==1) {printf("%s ",$0)} \
|
||||||
(/\.$/ && (on==1)) {on=0;print ""}' $1 | \
|
(/\.$/ && (on==1)) {on=0;print ""}' $1 | \
|
||||||
@ -27,15 +51,23 @@ function definition {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function gb2fasta {
|
function gb2fasta {
|
||||||
|
local gbk=$1
|
||||||
|
local CAT=cat
|
||||||
|
|
||||||
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
AC=`ac $1`
|
AC=`ac $1`
|
||||||
TAXID=`taxid $1`
|
TAXID=`taxid $1`
|
||||||
DEFINITION=`definition $1`
|
DEFINITION=`definition $1`
|
||||||
|
|
||||||
echo ">${AC} taxid=${TAXID}; ${DEFINITION}"
|
echo ">${AC} taxid=${TAXID}; ${DEFINITION}"
|
||||||
|
|
||||||
$AwkCmd '/^\/\// {on=0} \
|
$CAT $gbk | \
|
||||||
|
$AwkCmd '/^\/\// {on=0} \
|
||||||
(on==1) {print $0} \
|
(on==1) {print $0} \
|
||||||
/^ORIGIN / {on=1}' $1 | \
|
/^ORIGIN / {on=1}' | \
|
||||||
sed -E 's/^ *[0-9]+ +//' | \
|
sed -E 's/^ *[0-9]+ +//' | \
|
||||||
sed 's/ //g'
|
sed 's/ //g'
|
||||||
}
|
}
|
||||||
@ -58,9 +90,17 @@ function findCAUtrna {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function trnaAnnotations {
|
function trnaAnnotations {
|
||||||
|
local gbk=$1
|
||||||
|
local CAT=cat
|
||||||
|
|
||||||
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
|
$CAT $gbk | \
|
||||||
$AwkCmd '/^ORIGIN/ {on=0} \
|
$AwkCmd '/^ORIGIN/ {on=0} \
|
||||||
(on==1) {print $0} \
|
(on==1) {print $0} \
|
||||||
/^FEATURE/ {on=1}' $1 | \
|
/^FEATURE/ {on=1}' | \
|
||||||
$AwkCmd '/^ [^ ]/ {print ""} \
|
$AwkCmd '/^ [^ ]/ {print ""} \
|
||||||
{printf("%s ",$0)} \
|
{printf("%s ",$0)} \
|
||||||
END {print ""}' | \
|
END {print ""}' | \
|
||||||
@ -91,13 +131,20 @@ function annotateCAU {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function writeTRNA {
|
function writeTRNA {
|
||||||
TAXID=`taxid $1`
|
local gbk=$1
|
||||||
AC=`ac $1`
|
local CAT=cat
|
||||||
DEFINITION=`definition $1`
|
|
||||||
|
|
||||||
TRNATMP="$$.trna.txt"
|
if [[ "$gbk" =~ \.gz$ ]] ; then
|
||||||
|
CAT=gzcat
|
||||||
|
fi
|
||||||
|
|
||||||
trnaAnnotations $1 > ${TRNATMP}
|
local TAXID=`taxid $gbk`
|
||||||
|
local AC=`ac $gbk`
|
||||||
|
local DEFINITION=`definition $gbk`
|
||||||
|
|
||||||
|
local TRNATMP="$$.trna.txt"
|
||||||
|
|
||||||
|
trnaAnnotations $gbk > ${TRNATMP}
|
||||||
ntrna=`wc -l ${TRNATMP} | $AwkCmd '{print $1}'`
|
ntrna=`wc -l ${TRNATMP} | $AwkCmd '{print $1}'`
|
||||||
|
|
||||||
if (( ntrna > 0 )); then
|
if (( ntrna > 0 )); then
|
||||||
|
Reference in New Issue
Block a user