First version of the tRNA detector and of the global organnot.sh script
Former-commit-id: f2a75cf99b24875c90c426c2afb22a75b972bf60 Former-commit-id: 65e3dfb35df06ca69bb29b690c9a40e8940ac6bf
This commit is contained in:
@ -1,96 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
|
||||
export ORGANNOT_HOME=`dirname $0`
|
||||
|
||||
REPSEEK=${ORGANNOT_HOME}/repseek
|
||||
SUMATRA=${ORGANNOT_HOME}/sumatra
|
||||
ARAGORN=${ORGANNOT_HOME}/aragorn
|
||||
WRAPARAGORN=${ORGANNOT_HOME}/aragorn_wrapper.awk
|
||||
ECOFIND=${ORGANNOT_HOME}/ecofind
|
||||
|
||||
function annotateCAU {
|
||||
QUERY="$$.query.fasta"
|
||||
echo $1 | sed 's/&/ /' | tr '@' '\n' > ${QUERY}
|
||||
${SUMATRA} -d -n ${QUERY} $2 2> /dev/null | \
|
||||
awk ' {n[$2]+=1;d[$2]+=$3} \
|
||||
END {for (i in n) \
|
||||
print i, n[i],d[i], d[i]/n[i]\
|
||||
}' | \
|
||||
sort -rnk4 | \
|
||||
egrep '^trn(I|M|fM)' | \
|
||||
tail -1 | \
|
||||
awk '{print $1,$NF}'
|
||||
rm -rf ${QUERY}
|
||||
}
|
||||
|
||||
function gffTRNA {
|
||||
|
||||
${ARAGORN} -w -io -seq $3 | awk -v gid=${1} -f ${WRAPARAGORN}
|
||||
|
||||
}
|
||||
|
||||
# s'alimente avec un fichier.fasta
|
||||
# $3 : nb de caractere du fichier, t : nb de caractere du titre,
|
||||
# $1+1 : nb de retour chariot du fichier
|
||||
function seqlength {
|
||||
cat $1 | \
|
||||
wc |\
|
||||
awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
|
||||
}
|
||||
|
||||
|
||||
# recupere les informations issues du programme repseek avec l'origine des deux
|
||||
# IR et leur taille
|
||||
function lookforIR {
|
||||
${REPSEEK} -c -p 0.001 $1 | \
|
||||
grep 'Distant.inv' | \
|
||||
sort -n -k4 | \
|
||||
tail -1 | \
|
||||
awk '{print $7}' | \
|
||||
sed 's/-/ /g'
|
||||
}
|
||||
|
||||
# recupere le nom de la sequence analyse
|
||||
function seqName {
|
||||
head -n1 $1| \
|
||||
awk '{print $1}' | \
|
||||
sed 's/^>//' | \
|
||||
sed -E 's/.*\|([^|]+)\|/\1/'
|
||||
}
|
||||
|
||||
# cree un resume du fichier analyse au format gff
|
||||
# ex : GFF (NC_*** Repseek IR1 start end . + . )
|
||||
function gffIR {
|
||||
lseq=$2
|
||||
nom=$1
|
||||
lookforIR $3 | \
|
||||
awk -v nom="$nom" -v lseq="$lseq" \
|
||||
'BEGIN {OFS="\t"} \
|
||||
{ startIR1=$1; \
|
||||
startIR2=$2; \
|
||||
endIR1=startIR1 + $3 -1; \
|
||||
endIR2=startIR2 + $3 -1; \
|
||||
startSSC=1; \
|
||||
endSSC=startIR1-1; \
|
||||
startLSC=endIR1+1; \
|
||||
endLSC=startIR2-1; \
|
||||
\
|
||||
print nom,"RepSeek","misc_feature",startSSC,endSSC,"\.","+","\.","ID=SSC;note=small single copy region";\
|
||||
print nom,"RepSeek","repeat_region",startIR1,endIR1,"\.","+","\.","ID=IRA;note=inverted repeat A";\
|
||||
print nom,"RepSeek","misc_feature",startLSC,endLSC,"\.","+","\.","ID=LSC;note=large single copy region";\
|
||||
print nom,"RepSeek","repeat_region",startIR2,endIR2,"\.","-","\.","ID=IRB;note=inverted repeat B";\
|
||||
}'
|
||||
}
|
||||
|
||||
|
||||
echo "##gff-version 3"
|
||||
|
||||
|
||||
genome=$1
|
||||
genome_name=`seqName $1`
|
||||
genome_length=`seqlength $1`
|
||||
|
||||
gffIR ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'
|
||||
gffTRNA ${genome_name} ${genome_length} ${genome}| grep -v '^ *$'
|
@ -61,4 +61,4 @@ pushTmpDir ORG.ir
|
||||
|
||||
popTmpDir
|
||||
|
||||
exit 0
|
||||
exit 0
|
@ -27,12 +27,14 @@
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source ${SCRIPT_DIR}/../lib/lookforIR.lib.sh
|
||||
|
||||
ORG_DEBUG=1
|
||||
|
||||
pushTmpDir ORG.normalize
|
||||
|
||||
tmpfasta1="tmp_$$_1.fasta"
|
||||
tmpfasta2="tmp_$$_2.fasta"
|
||||
|
||||
logdebug "Running on : $QUERY"
|
||||
|
||||
loginfo "Computing the genome size..."
|
||||
genome_length=$(seqlength $QUERY)
|
||||
|
@ -47,7 +47,9 @@ function lookForIR {
|
||||
}
|
||||
|
||||
SCDB="${IR_DATA_DIR}/SC_RefDB"
|
||||
QUERY="${CALL_DIR}/$1"
|
||||
|
||||
|
||||
openLogFile "${QUERY/.*/}.log"
|
||||
if [[ ! "$1" =~ ^/ ]]; then
|
||||
QUERY="${CALL_DIR}/$1"
|
||||
else
|
||||
QUERY="$1"
|
||||
fi
|
||||
|
@ -31,7 +31,7 @@ pushTmpDir ORG.buildSCDB
|
||||
|
||||
|
||||
loginfo "Building LSC coorientation graph..."
|
||||
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${LOGFILE} > LSC.tgf
|
||||
${PROG_DIR}/coorienteSC.sh LSC.fasta 20000 ${ORG_LOGFILE} > LSC.tgf
|
||||
${PROG_DIR}/cc.py LSC.tgf > LSC.cc
|
||||
loginfo " --> $(awk '{print $1}' LSC.cc | uniq | wc -l) connected componants"
|
||||
loginfo "Done"
|
||||
@ -73,7 +73,7 @@ pushTmpDir ORG.buildSCDB
|
||||
|
||||
|
||||
loginfo "Checking LCS homogeneity..."
|
||||
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${LOGFILE} > LSC_RefDB.tgf
|
||||
${PROG_DIR}/coorienteSC.sh LSC.direct.fasta 20000 ${ORG_LOGFILE} > LSC_RefDB.tgf
|
||||
${PROG_DIR}/cc.py LSC_RefDB.tgf > LSC_RefDB.cc
|
||||
NCC=$(awk '{print $1}' LSC_RefDB.cc | uniq | wc -l)
|
||||
if (( $NCC == 1 )); then
|
||||
@ -103,7 +103,7 @@ pushTmpDir ORG.buildSCDB
|
||||
|
||||
|
||||
loginfo "Building SSC coorientation graph..."
|
||||
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${LOGFILE} > SSC.tgf
|
||||
${PROG_DIR}/coorienteSC.sh SSC.fasta 5000 ${ORG_LOGFILE} > SSC.tgf
|
||||
${PROG_DIR}/cc.py SSC.tgf > SSC.cc
|
||||
loginfo " --> $(awk '{print $1}' SSC.cc | uniq | wc -l) connected componants"
|
||||
loginfo "Done"
|
||||
@ -146,7 +146,7 @@ pushTmpDir ORG.buildSCDB
|
||||
|
||||
|
||||
loginfo "Checking SSC homogeneity..."
|
||||
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${LOGFILE} > SSC_RefDB.tgf
|
||||
${PROG_DIR}/coorienteSC.sh SSC.direct.fasta 5000 ${ORG_LOGFILE} > SSC_RefDB.tgf
|
||||
${PROG_DIR}/cc.py SSC_RefDB.tgf > SSC_RefDB.cc
|
||||
NCC=$(awk '{print $1}' SSC_RefDB.cc | uniq | wc -l)
|
||||
if (( $NCC == 1 )); then
|
||||
|
42
detectors/trna/bin/go_trna.sh
Executable file
42
detectors/trna/bin/go_trna.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Annotate tRNA
|
||||
#
|
||||
#========================================================================================
|
||||
#
|
||||
# Annotate tRNA based on the Aragorn software predictions.
|
||||
|
||||
# go_trna.sh <FASTAFILE>
|
||||
#
|
||||
# - <FASTAFILE> : The fasta file containing the genome to annotate
|
||||
#
|
||||
# Results are printed to the standart output
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"
|
||||
|
||||
pushTmpDir ORG.trna
|
||||
|
||||
CAUTRNADB="${TRNA_DATA_DIR}/CAU_tRNA_DB.fasta"
|
||||
export CAUTRNADB
|
||||
|
||||
if [[ ! "$1" =~ ^/ ]]; then
|
||||
QUERY="${CALL_DIR}/$1"
|
||||
else
|
||||
QUERY="$1"
|
||||
fi
|
||||
|
||||
TRNA=$(basename ${QUERY})
|
||||
|
||||
aragorn -i -w -seq ${QUERY} | \
|
||||
${PROG_DIR}/../lib/aragorn_wrapper.awk
|
||||
|
||||
|
||||
popTmpDir
|
||||
|
||||
exit 0
|
||||
|
@ -7,17 +7,9 @@ function genomeid() {
|
||||
return gid;
|
||||
}
|
||||
|
||||
function home() {
|
||||
"echo $ORGANNOT_HOME" | getline homedir;
|
||||
return homedir;
|
||||
}
|
||||
|
||||
function prog(program) {
|
||||
return home() "/" program;
|
||||
}
|
||||
|
||||
function trnalib(prognam) {
|
||||
return home() "/lib/trnaCAU.ref.fasta";
|
||||
function trnalib() {
|
||||
"echo $CAUTRNADB" | getline ref;
|
||||
return ref
|
||||
}
|
||||
|
||||
function awkPID() {
|
||||
@ -65,19 +57,20 @@ function patchtRNA(anticodon,trna,seq) {
|
||||
if (anticodon == "cat") {
|
||||
file=printfasta(trna "_" anticodon,seq,"");
|
||||
|
||||
command= prog("sumatra") " -d -n " file " " trnalib();
|
||||
command= "sumatra -t 0.9 -x -n " file " " trnalib() " 2>> /dev/null";
|
||||
while ((command | getline output) > 0) {
|
||||
split(output,field," ");
|
||||
n[field[2]]++;
|
||||
d[field[2]]=field[3];
|
||||
match(field[2],"trn.M?");
|
||||
trna=substr(field[2],RSTART,RLENGTH);
|
||||
n[trna]+=field[5];
|
||||
}
|
||||
close(command)
|
||||
|
||||
dmin=1;
|
||||
nmax=0;
|
||||
for (i in n) {
|
||||
dist=d[i]/n[i];
|
||||
if (dist < dmin) {
|
||||
dmin=dist;
|
||||
dist=n[i];
|
||||
if (n[i] > nmax) {
|
||||
nmax=n[i];
|
||||
trna=i;
|
||||
}
|
||||
}
|
||||
@ -94,6 +87,42 @@ function gene2product(gene) {
|
||||
return "tRNA-" AA3[substr(gene,4,1)];
|
||||
}
|
||||
|
||||
function emblTRNA(geneid,trna,loc,anti,intron,seq) {
|
||||
if (loc ~ /^c/) {
|
||||
sub("c\\[","complement(",loc);
|
||||
sub("\\]",")",loc);
|
||||
sub(",","..",loc)}
|
||||
else {
|
||||
sub("\\[","",loc);
|
||||
sub("\\]","",loc);
|
||||
sub(",","..",loc)}
|
||||
|
||||
anti=toupper(anti);
|
||||
gsub("T","U",anti);
|
||||
product=gene2product(trna);
|
||||
|
||||
if (intron!="") {
|
||||
l=length(intron);
|
||||
intron=substr(intron,2,l-2);
|
||||
split(intron,intronpos,",");
|
||||
ib=intronpos[0];
|
||||
ie=intronpos[1];
|
||||
match(loc,"[0-9][0-9]*");
|
||||
gb=substr(loc,RSTART,RLENGTH);
|
||||
sub("\\.\\.",".." (gb + ib -2) "," (gb + ie) "..",loc); \
|
||||
sub("complement","complement(join",loc);\
|
||||
if (substr(loc,1,1) ~ /[0-9]/) {
|
||||
loc="join("loc}
|
||||
loc=loc")";
|
||||
}
|
||||
|
||||
print "FT tRNA " loc;
|
||||
print "FT /gene=\""trna"\"";
|
||||
print "FT /anticodon=\""anti"\"";
|
||||
print "FT /product=\""product"("anti")\"";
|
||||
|
||||
}
|
||||
|
||||
function gffTRNA(geneid,trna,loc,anti,intron,seq) {
|
||||
if (loc ~ /^c/) {
|
||||
complement="-";
|
||||
@ -139,7 +168,6 @@ function gffTRNA(geneid,trna,loc,anti,intron,seq) {
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
print ARGV[1];
|
||||
AA1["Ala"]="A";
|
||||
AA1["Cys"]="C";
|
||||
AA1["Asp"]="D";
|
||||
@ -201,7 +229,7 @@ BEGIN {
|
||||
{ seq=epissage(intron,seq);
|
||||
trna=patchtRNA(anti,trna,seq);
|
||||
# print geneid,trna,loc,anti,"'"intron"'",seq;
|
||||
gffTRNA(geneid,trna,loc,anti,intron,seq);
|
||||
emblTRNA(geneid,trna,loc,anti,intron,seq);
|
||||
seq=""
|
||||
}
|
||||
|
||||
@ -225,5 +253,5 @@ BEGIN {
|
||||
END { seq=epissage(intron,seq);
|
||||
trna=patchtRNA(anti,trna,seq);
|
||||
# print geneid,trna,loc,anti,"'"intron"'",seq;
|
||||
gffTRNA(geneid,trna,loc,anti,intron,seq);
|
||||
emblTRNA(geneid,trna,loc,anti,intron,seq);
|
||||
}
|
47
organnote.sh
Executable file
47
organnote.sh
Executable file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# Annotate tRNA
|
||||
#
|
||||
#========================================================================================
|
||||
#
|
||||
# Annotate tRNA based on the Aragorn software predictions.
|
||||
|
||||
# go_trna.sh <FASTAFILE>
|
||||
#
|
||||
# - <FASTAFILE> : The fasta file containing the genome to annotate
|
||||
#
|
||||
# Results are printed to the standart output
|
||||
#
|
||||
#========================================================================================
|
||||
|
||||
# -- CAUTION -- Works as long than the script
|
||||
# is not called through a symlink
|
||||
SCRIPT_DIR="$(dirname ${BASH_SOURCE[0]})"
|
||||
source "${SCRIPT_DIR}/scripts/bash_init.sh"
|
||||
|
||||
pushTmpDir ORG.organnot
|
||||
|
||||
if [[ ! "$1" =~ ^/ ]]; then
|
||||
QUERY="${CALL_DIR}/$1"
|
||||
else
|
||||
QUERY="$1"
|
||||
fi
|
||||
|
||||
RESULTS=$(basename ${QUERY/.*/})
|
||||
LOG="${CALL_DIR}/${RESULTS}.log"
|
||||
|
||||
rm -f ${LOG}
|
||||
openLogFile ${LOG}
|
||||
|
||||
${PROG_DIR}/detectors/normalize/bin/go_normalize.sh ${QUERY} > "${RESULTS}.norm.fasta"
|
||||
|
||||
${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot"
|
||||
${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot"
|
||||
|
||||
cat "${RESULTS}.annot"
|
||||
|
||||
popTmpDir
|
||||
|
@ -19,11 +19,18 @@ function getAbsolutePath {
|
||||
function pushTmpDir {
|
||||
TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
|
||||
pushd $TMP_DIR >& /dev/null
|
||||
TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
|
||||
logdebug "Pushing temp directory $TMP_DIR"
|
||||
logdebug "Stack : ${TMP_DIR_STACK}"
|
||||
}
|
||||
|
||||
function popTmpDir {
|
||||
TMP_DIR=$(echo $TMP_DIR_STACK | awk '{print $1}')
|
||||
TMP_DIR_STACK=$(echo $TMP_DIR_STACK | awk '{$1="";print $0}')
|
||||
popd >& /dev/null
|
||||
rm -rf $TMP_DIR >& /dev/null
|
||||
logdebug "Poping temp directory $TMP_DIR"
|
||||
logdebug "Stack : ${TMP_DIR_STACK}"
|
||||
}
|
||||
|
||||
# Logging functions
|
||||
@ -34,32 +41,42 @@ function errcho {
|
||||
}
|
||||
|
||||
function openLogFile {
|
||||
LOGFILE=$1
|
||||
touch ${LOGFILE}
|
||||
ORG_LOGFILE=$1
|
||||
export ORG_LOGFILE
|
||||
touch ${ORG_LOGFILE}
|
||||
}
|
||||
|
||||
|
||||
function loginfo {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${LOGFILE}
|
||||
if [[ ! -z ${ORG_LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA INFO ] $$ -- $1" >> ${ORG_LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
function logerror {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${LOGFILE}
|
||||
if [[ ! -z ${ORG_LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA ERROR ] $$ -- $1" >> ${ORG_LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
function logwarning {
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1"
|
||||
if [[ ! -z ${LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${LOGFILE}
|
||||
if [[ ! -z ${ORG_LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA WARNING] $$ -- $1" >> ${ORG_LOGFILE}
|
||||
fi
|
||||
}
|
||||
|
||||
function logdebug {
|
||||
if [[ ! -z ${ORG_DEBUG} ]]; then
|
||||
errcho `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1"
|
||||
if [[ ! -z ${ORG_LOGFILE} ]]; then
|
||||
echo `date +'%Y-%m-%d %H:%M:%S'` "[OA DEBUG ] $$ -- $1" >> ${ORG_LOGFILE}
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# Sequence related functions
|
||||
|
||||
# Counts how many sequences are stored in a fasta file
|
||||
|
Reference in New Issue
Block a user