Initial commit of ORG.Annotate
Former-commit-id: 8c4c1d0391ad06ed67e748e0f837399e9b3de7ad Former-commit-id: 5af9ed909a6f6bed7bae86c77bd770ae4b5fc14e
This commit is contained in:
96
annotate_plastid.sh
Executable file
96
annotate_plastid.sh
Executable file
@ -0,0 +1,96 @@
|
||||
#!/bin/bash
#
# Tool locations used by the annotation functions in this script.
# Every helper is looked up in the directory that contains the script.
#

# Directory of this script. Assignment and export are kept separate so a
# failing dirname is not masked by export, and "$0" is quoted so an install
# path containing spaces is not split into several words.
ORGANNOT_HOME=$(dirname "$0")
export ORGANNOT_HOME

# Helper programs and the awk wrapper, all relative to ORGANNOT_HOME.
REPSEEK="${ORGANNOT_HOME}/repseek"
SUMATRA="${ORGANNOT_HOME}/sumatra"
ARAGORN="${ORGANNOT_HOME}/aragorn"
WRAPARAGORN="${ORGANNOT_HOME}/aragorn_wrapper.awk"
ECOFIND="${ORGANNOT_HOME}/ecofind"
|
||||
|
||||
# Compare one packed query record against a reference file with sumatra and
# report the best trnI / trnM / trnfM group.
#
# Arguments:
#   $1 - query record packed on a single line: the first '&' is turned into
#        a space and every '@' into a newline before it is written to disk
#   $2 - reference file given to sumatra as its second input
# Globals:
#   SUMATRA (read) - sumatra executable
# Outputs:
#   One line "<group> <mean>" on stdout, where <group> is the value of
#   column 2 of the sumatra output and <mean> the average of column 3 over
#   that group; only groups whose name starts with trnI, trnM or trnfM are
#   considered and the one with the smallest mean is printed.
#   NOTE(review): column 3 is presumably the distance requested with -d.
# Returns:
#   1 if the scratch directory cannot be created, otherwise the status of
#   the final cleanup.
function annotateCAU {
  local workdir query

  # Private scratch directory instead of a predictable "$$.query.fasta" in
  # the current directory; the .fasta suffix of the query file is kept.
  workdir=$(mktemp -d) || return 1
  query="${workdir}/query.fasta"

  printf '%s\n' "$1" | sed 's/&/ /' | tr '@' '\n' > "${query}"

  # sumatra diagnostics are discarded on purpose, only its table is used.
  "${SUMATRA}" -d -n "${query}" "$2" 2> /dev/null \
    | awk '
        { n[$2] += 1; d[$2] += $3 }
        END {
          for (i in n)
            print i, n[i], d[i], d[i] / n[i]
        }' \
    | sort -rnk4 \
    | grep -E '^trn(I|M|fM)' \
    | tail -1 \
    | awk '{ print $1, $NF }'

  rm -rf -- "${workdir}"
}
|
||||
|
||||
# Run aragorn on a sequence file and pass its report through the aragorn
# awk wrapper.
#
# Arguments:
#   $1 - sequence identifier, handed to the wrapper as awk variable "gid"
#   $2 - not used by this function (same call shape as gffIR)
#   $3 - sequence file given to aragorn
# Globals:
#   ARAGORN (read)     - aragorn executable
#   WRAPARAGORN (read) - awk program applied to the aragorn output
# Outputs:
#   Whatever the wrapper prints, on stdout.
function gffTRNA {
  # All expansions are quoted so paths containing whitespace stay one word.
  "${ARAGORN}" -w -io -seq "$3" | awk -v gid="$1" -f "${WRAPARAGORN}"
}
|
||||
|
||||
# Print the sequence length of a FASTA file.
#
# The first line is taken as the title line and skipped; the lengths of all
# following lines are added up. Line terminators (LF or CRLF) are not
# counted. An empty file or a file holding only a title yields 0.
#
# Arguments:
#   $1 - FASTA file
# Outputs:
#   The length as a single integer on stdout.
function seqlength {
  # The file is supplied on stdin so that a name containing '=' can never be
  # mistaken by awk for a variable assignment. LC_ALL=C makes length() count
  # bytes.
  LC_ALL=C awk '
    NR > 1 {
      sub(/\r$/, "")
      total += length($0)
    }
    END { print total + 0 }
  ' < "$1"
}
|
||||
|
||||
|
||||
# Extract the strongest distant inverted repeat reported by repseek.
#
# repseek is run with "-c -p 0.001" on the given file. Of the lines matching
# 'Distant.inv', the one sorting last numerically on column 4 is kept, and
# its 7th column is printed with every '-' replaced by a space.
# NOTE(review): gffIR reads the result as "start1 start2 length", so column 7
# presumably has the form start1-start2-length - confirm against repseek.
#
# Arguments:
#   $1 - sequence file to scan
# Globals:
#   REPSEEK (read) - repseek executable
# Outputs:
#   One line on stdout, or nothing when no line matches.
function lookforIR {
  # Quoted so that a genome path containing whitespace stays one argument.
  "${REPSEEK}" -c -p 0.001 "$1" \
    | grep 'Distant.inv' \
    | sort -n -k4 \
    | tail -1 \
    | awk '{ print $7 }' \
    | sed 's/-/ /g'
}
|
||||
|
||||
# Print the name of the sequence stored in a FASTA file.
#
# Takes the first whitespace-delimited word of the first line, drops the
# leading '>', and, for pipe-delimited identifiers ending in '|'
# (e.g. "gi|12345|ref|NC_000932.1|"), keeps only the last field.
# NOTE(review): an identifier containing pipes but no trailing '|' is only
# partially rewritten by the second expression; behaviour kept as is.
#
# Arguments:
#   $1 - FASTA file
# Outputs:
#   The sequence name on stdout.
function seqName {
  # "$1" is quoted so a path containing whitespace is read as one file.
  head -n1 "$1" \
    | awk '{ print $1 }' \
    | sed -E -e 's/^>//' -e 's/.*\|([^|]+)\|/\1/'
}
|
||||
|
||||
# Emit GFF lines describing the inverted repeats found by lookforIR and the
# regions delimited by them.
# Line shape: <name> RepSeek <type> <start> <end> . <strand> . <attributes>
#
# Arguments:
#   $1 - sequence name, written in the first GFF column
#   $2 - sequence length; passed to awk as "lseq" but not used by the
#        program at present
#   $3 - sequence file handed to lookforIR
# Outputs:
#   Four tab-separated lines (SSC, IRA, LSC, IRB) on stdout for each line
#   produced by lookforIR, which is read as "startIR1 startIR2 length".
function gffIR {
  local nom=$1
  local lseq=$2

  # The placeholder columns are written as "." rather than "\.": the latter
  # is an undefined awk escape (gawk warns about it, mawk keeps the
  # backslash in the output).
  lookforIR "$3" \
    | awk -v nom="${nom}" -v lseq="${lseq}" '
        BEGIN { OFS = "\t" }
        {
          startIR1 = $1
          startIR2 = $2
          endIR1 = startIR1 + $3 - 1
          endIR2 = startIR2 + $3 - 1
          startSSC = 1
          endSSC = startIR1 - 1
          startLSC = endIR1 + 1
          endLSC = startIR2 - 1

          print nom, "RepSeek", "misc_feature", startSSC, endSSC, ".", "+", ".", "ID=SSC;note=small single copy region"
          print nom, "RepSeek", "repeat_region", startIR1, endIR1, ".", "+", ".", "ID=IRA;note=inverted repeat A"
          print nom, "RepSeek", "misc_feature", startLSC, endLSC, ".", "+", ".", "ID=LSC;note=large single copy region"
          print nom, "RepSeek", "repeat_region", startIR2, endIR2, ".", "-", ".", "ID=IRB;note=inverted repeat B"
        }'
}
|
||||
|
||||
|
||||
# Entry point: annotate the FASTA file given as first argument and print
# the result as GFF3 on stdout.

# Refuse to run without a readable input file; otherwise the helpers would
# fall back to reading standard input and appear to hang.
if [[ $# -lt 1 || ! -r "$1" ]]; then
  printf 'Usage: %s genome.fasta\n' "${0##*/}" >&2
  exit 1
fi

echo "##gff-version 3"

genome=$1
genome_name=$(seqName "${genome}")
genome_length=$(seqlength "${genome}")

# Blank (or space-only) lines produced by the annotators are dropped.
gffIR "${genome_name}" "${genome_length}" "${genome}" | grep -v '^ *$'
gffTRNA "${genome_name}" "${genome_length}" "${genome}" | grep -v '^ *$'
|
Reference in New Issue
Block a user