97 lines
3.0 KiB
Bash
97 lines
3.0 KiB
Bash
|
#!/bin/bash
|
||
|
#
|
||
|
#
|
||
|
|
||
|
export ORGANNOT_HOME=`dirname $0`
|
||
|
|
||
|
REPSEEK=${ORGANNOT_HOME}/repseek
|
||
|
SUMATRA=${ORGANNOT_HOME}/sumatra
|
||
|
ARAGORN=${ORGANNOT_HOME}/aragorn
|
||
|
WRAPARAGORN=${ORGANNOT_HOME}/aragorn_wrapper.awk
|
||
|
ECOFIND=${ORGANNOT_HOME}/ecofind
|
||
|
|
||
|
function annotateCAU {
|
||
|
QUERY="$$.query.fasta"
|
||
|
echo $1 | sed 's/&/ /' | tr '@' '\n' > ${QUERY}
|
||
|
${SUMATRA} -d -n ${QUERY} $2 2> /dev/null | \
|
||
|
awk ' {n[$2]+=1;d[$2]+=$3} \
|
||
|
END {for (i in n) \
|
||
|
print i, n[i],d[i], d[i]/n[i]\
|
||
|
}' | \
|
||
|
sort -rnk4 | \
|
||
|
egrep '^trn(I|M|fM)' | \
|
||
|
tail -1 | \
|
||
|
awk '{print $1,$NF}'
|
||
|
rm -rf ${QUERY}
|
||
|
}
|
||
|
|
||
|
function gffTRNA {
|
||
|
|
||
|
${ARAGORN} -w -io -seq $3 | awk -v gid=${1} -f ${WRAPARAGORN}
|
||
|
|
||
|
}
|
||
|
|
||
|
# s'alimente avec un fichier.fasta
|
||
|
# $3 : nb de caractere du fichier, t : nb de caractere du titre,
|
||
|
# $1+1 : nb de retour chariot du fichier
|
||
|
function seqlength {
|
||
|
cat $1 | \
|
||
|
wc |\
|
||
|
awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
|
||
|
}
|
||
|
|
||
|
|
||
|
# recupere les informations issues du programme repseek avec l'origine des deux
|
||
|
# IR et leur taille
|
||
|
function lookforIR {
|
||
|
${REPSEEK} -c -p 0.001 $1 | \
|
||
|
grep 'Distant.inv' | \
|
||
|
sort -n -k4 | \
|
||
|
tail -1 | \
|
||
|
awk '{print $7}' | \
|
||
|
sed 's/-/ /g'
|
||
|
}
|
||
|
|
||
|
# recupere le nom de la sequence analyse
|
||
|
function seqName {
|
||
|
head -n1 $1| \
|
||
|
awk '{print $1}' | \
|
||
|
sed 's/^>//' | \
|
||
|
sed -E 's/.*\|([^|]+)\|/\1/'
|
||
|
}
|
||
|
|
||
|
# cree un resume du fichier analyse au format gff
|
||
|
# ex : GFF (NC_*** Repseek IR1 start end . + . )
|
||
|
function gffIR {
|
||
|
lseq=$2
|
||
|
nom=$1
|
||
|
lookforIR $3 | \
|
||
|
awk -v nom="$nom" -v lseq="$lseq" \
|
||
|
'BEGIN {OFS="\t"} \
|
||
|
{ startIR1=$1; \
|
||
|
startIR2=$2; \
|
||
|
endIR1=startIR1 + $3 -1; \
|
||
|
endIR2=startIR2 + $3 -1; \
|
||
|
startSSC=1; \
|
||
|
endSSC=startIR1-1; \
|
||
|
startLSC=endIR1+1; \
|
||
|
endLSC=startIR2-1; \
|
||
|
\
|
||
|
print nom,"RepSeek","misc_feature",startSSC,endSSC,"\.","+","\.","ID=SSC;note=small single copy region";\
|
||
|
print nom,"RepSeek","repeat_region",startIR1,endIR1,"\.","+","\.","ID=IRA;note=inverted repeat A";\
|
||
|
print nom,"RepSeek","misc_feature",startLSC,endLSC,"\.","+","\.","ID=LSC;note=large single copy region";\
|
||
|
print nom,"RepSeek","repeat_region",startIR2,endIR2,"\.","-","\.","ID=IRB;note=inverted repeat B";\
|
||
|
}'
|
||
|
}
|
||
|
|
||
|
|
||
|
echo "##gff-version 3"
|
||
|
|
||
|
|
||
|
genome=$1
|
||
|
genome_name=`seqName $1`
|
||
|
genome_length=`seqlength $1`
|
||
|
|
||
|
gffIR ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'
|
||
|
gffTRNA ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'
|