Switch to a SwissProt-based reference database for CDS annotation

Former-commit-id: 3da31ce8a135394ecac041291134d61f11f06d8f
Former-commit-id: 406f41a7cb2db14ea832480b86f72a11d3b0ab4a
This commit is contained in:
2022-02-16 22:50:17 +01:00
parent 90b3ee9b04
commit 831669433e
644 changed files with 25433 additions and 485597 deletions

View File

@ -145,7 +145,7 @@ function fastaCount {
# seqlength <fasta_file>
#
# Prints the number of sequence characters stored in <fasta_file>.
# The value is computed from the `wc` counts of the whole file as
#   total_chars - chars_of_first_line - number_of_lines + 1
# i.e. the first line (assumed to be the fasta header of a single-entry
# file - TODO confirm with callers) and every newline are discounted.
#
# Globals:   AwkCmd (read) - awk command used by this file
# Arguments: $1 - path of the fasta file
# Outputs:   the length, on stdout
function seqlength {
  # `t` is the byte count of the first line, newline included; that newline
  # is also counted in the line total ($1), hence the final "+ 1".
  wc < "$1" \
  | $AwkCmd -v t="$(head -1 "$1" | wc -c)" '{print $3 - t - $1 + 1}'
}
function readfirstfastaseq {
@ -217,6 +217,49 @@ function reversecomp {
| rev
}
# annotatedParts [annotation_file]
#
# Lists the coordinate ranges attached to CDS, rRNA and tRNA feature lines
# of a feature table, one "begin-end" pair per line, sorted numerically on
# the begin coordinate. When begin > end in the input the two values are
# swapped, so begin <= end always holds in the output.
#
# Pipeline steps:
#   1. keep every line matching "^FT (CDS|rRNA|tRNA)" plus the three lines
#      that follow it (presumably to catch locations wrapped onto
#      continuation lines);
#   2. drop the "--" separators grep inserts between non-adjacent groups;
#   3. drop lines whose content after "FT" starts with "/";
#   4. extract every "N..M" token that remains;
#   5. normalise the pair order and print it as "N-M".
# Note that any "N..M" token found on the three trailing context lines is
# reported as well, whatever feature it belongs to.
#
# NOTE(review): the selection pattern is kept exactly as found in this file
# (a single blank between "FT" and the feature key) - confirm it matches the
# layout of the annotation files actually processed.
#
# Globals:   AwkCmd (read, optional) - awk command, defaults to `awk`
# Arguments: $1 - annotation file; standard input is read when empty
# Outputs:   "begin-end" lines on stdout
function annotatedParts() {
  grep -E -A 3 "^FT (CDS|rRNA|tRNA)" "${1:--}" \
  | grep -E -v -- '^--' \
  | grep -E -v '^FT +/' \
  | grep -E -o '[0-9]+\.\.[0-9]+' \
  | ${AwkCmd:-awk} -F'.' '($1 > $3) {x=$3;$3=$1;$1=x} {print $1"-"$3}' \
  | sort -n
}
# _notAnnotedRegion <genome_fasta> <from> <to>
#
# Private helper of notAnnoted: extracts positions <from>..<to> of the genome
# with cutseq and rewrites every fasta header line so that it is reduced to
# its first word followed by " from=<from>; to=<to>;".
function _notAnnotedRegion() {
  cutseq "$1" "$2" "$3" \
  | $AwkCmd -v from="$2" -v to="$3" '
      /^>/ {$0=$1 " from=" from "; to=" to ";"}
      {print $0}
    '
}

# notAnnoted <annotation_file> <genome_fasta> <minimum>
#
# Writes, in fasta format, the stretches of the genome that are not covered
# by any range reported by annotatedParts and that are at least <minimum>
# positions long. Each emitted header carries the region coordinates as
# "from=<begin>; to=<end>;".
#
# The ranges are walked in the order produced by annotatedParts (sorted on
# their begin position) while `from` tracks the first position not yet
# covered. `from` only ever moves forward: a range nested inside a
# previously seen one must not pull it back, otherwise positions already
# annotated would be reported as free.
#
# Globals:   AwkCmd (read) - awk command used by this file
# Arguments: $1 - annotation file handed to annotatedParts
#            $2 - genome fasta file handed to seqlength and cutseq
#            $3 - minimum length of a region to be reported
# Outputs:   fasta records on stdout
# Returns:   1 when the genome length cannot be computed
function notAnnoted() {
  local annotations=$1
  local genome=$2
  local minimum=$3
  local genomesize
  local from=1
  local part begin end

  # Declared separately so that a failure of seqlength is not masked by the
  # exit status of `local`.
  genomesize=$(seqlength "$genome") || return 1

  # Word splitting is intended here: every word is one "begin-end" pair.
  for part in $(annotatedParts "$annotations") ; do
    begin=${part%%-*}
    end=${part##*-}

    # Gap between the last covered position and the start of this range.
    if (( ( begin - from ) >= minimum )) ; then
      _notAnnotedRegion "$genome" "$from" "$(( begin - 1 ))"
    fi

    # Advance the cursor, never move it backwards (nested/overlapping ranges).
    if (( end + 1 > from )) ; then
      from=$(( end + 1 ))
    fi
  done

  # Trailing region between the last annotation and the end of the genome.
  if (( ( genomesize - from + 1 ) >= minimum )) ; then
    _notAnnotedRegion "$genome" "$from" "$genomesize"
  fi
}
#
# Process management related function
#