diff --git a/config/default.conf b/config/default.conf index ff863ec..8bdb54c 100644 --- a/config/default.conf +++ b/config/default.conf @@ -118,9 +118,9 @@ INCDIR = $(abspath $(PRTDIR))/include # default gmake variable in implicit rules # ------------------------------------ -CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR) +CFLAGS := $(CFLAGS) $(OPTIM) $(MACHDEF) -I$(INCDIR) -LDFLAGS = -L$(LIBDIR) -L. +LDFLAGS := $(LDFLAGS) -L$(LIBDIR) -L. LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS) diff --git a/detectors/ir/bin/go_ir.sh b/detectors/ir/bin/go_ir.sh index e199449..2f7a0bb 100755 --- a/detectors/ir/bin/go_ir.sh +++ b/detectors/ir/bin/go_ir.sh @@ -61,4 +61,4 @@ pushTmpDir ORG.ir popTmpDir -exit 0 \ No newline at end of file +exit 0 diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index 200e21d..0fbd8b9 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -100,7 +100,7 @@ pushTmpDir ORG.normalize tmpLSC="tmp_$$_LSC.fasta" tmpSSC="tmp_$$_SSC.fasta" - # Extract the first SC present in between the two IRs + # Extract the central SC present in between the two IRs # considering it as LSC let "beginLSC=$endIR1+1" @@ -110,7 +110,7 @@ pushTmpDir ORG.normalize strandLSC="${IR[1]}" - # Extract the second SC present in two parts + # Extract the external SC present in two parts # Considering it as SSC let "beginSSC=$endIR2+1" @@ -130,16 +130,17 @@ pushTmpDir ORG.normalize # Actually this is the oposite LSC is SSC and SSC is LSC - # Exchange the SSC and LSC sequences + # Exchanges the SSC and LSC sequences mv ${tmpSSC} ${tmpfasta1} mv ${tmpLSC} ${tmpSSC} mv ${tmpfasta1} ${tmpLSC} - # Exchange the IRa and IRb sequences + # Exchanges the IRa and IRb sequences mv ${tmpIR1} ${tmpfasta1} mv ${tmpIR2} ${tmpIR1} mv ${tmpfasta1} ${tmpIR2} + # Exchanges the strand of both the Single copies tmp=${strandSSC} strandSSC=${strandLSC} strandLSC=${tmp} @@ -162,7 +163,6 @@ pushTmpDir ORG.normalize cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta - popTmpDir exit 0 diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 1919a21..94ea75b 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -1,3 +1,5 @@ +#!/bin/bash + source "${THIS_DIR}/../../../scripts/bash_init.sh" SELECTIR="${PROG_DIR}/../../normalize/lib/selectIR.py" @@ -10,12 +12,21 @@ function lookForIR { local REPEATS="${MATCHES/.*/}.repseek" + # Blast columns: + # query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score + # We keep blast matches if : + # The match is longer than 1000 + # The identity is higher than 80% + # + # The match file has the following format: + # LSC/SSC begin end same_strand=1/diff_strand=0 + loginfo "Locating SSC and LSC by similarity..." blastn -db ${SCDB} \ -query ${QUERY} \ -outfmt 6 \ -max_target_seqs 10000 | \ - awk '($4 > 1000) && ($3>80) { \ + awk '($4 > 100) && ($3>80) { \ SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \ if ($7 < $8) \ {print substr($2,1,3),$7,$8,SAME} \ @@ -23,12 +34,13 @@ function lookForIR { {print substr($2,1,3),$8,$7,SAME}}' | \ sort -nk 2 > ${MATCHES} loginfo "Done" + loginfo "Looking for long inverted repeats..." repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified" loginfo "Done" - + loginfo "Marking and selecting the best inverted repeat..." local IR=( $(${SELECTIR} ${MATCHES} ${REPEATS}) ) loginfo "Done" diff --git a/detectors/normalize/lib/selectIR.py b/detectors/normalize/lib/selectIR.py index cefe74b..7bd216d 100755 --- a/detectors/normalize/lib/selectIR.py +++ b/detectors/normalize/lib/selectIR.py @@ -8,6 +8,12 @@ repeats = open(sys.argv[2]) chloro = {'LSC' : [], 'SSC' : [] } chlorosize =0 +# We scan the blast matches: +# We build a vector with one position per base pair counting the matches + +# The match file has the following format: +# LSC/SSC begin end same_strand=1/diff_strand=0 + for line in data: parts = line.strip().split() if len(parts) >= 4: @@ -16,7 +22,8 @@ for line in data: end = int(parts[2]) direction = int(parts[3]) - + # Change the code of the direction: + # reverse complement = -1 if direction==0: direction=-1 @@ -32,43 +39,60 @@ for line in data: for p in range(begin,end): chr[p]+=direction - + maxSSC = float(max(abs(n) for n in chloro['SSC'])) maxLSC = float(max(abs(n) for n in chloro['LSC'])) chloro['SSC']=[n / maxSSC for n in chloro['SSC']] chloro['LSC']=[n / maxLSC for n in chloro['LSC']] - scoreMax=0 +len1Max=0 +len2Max=0 + imax = len(chloro['LSC']) for line in repeats: parts = line.strip().split() + # First repeat position and length + # (position start at 0) pos1 = int(parts[1]) -1 len1 = int(parts[3]) + # Second repeat position and length + # (position start at 0) pos2 = int(parts[2]) -1 len2 = int(parts[4]) + # Location of the central single copy + # - in between the two IR - c_begin = min(pos1 + len1,imax) c_end = min(pos2,imax) + + # Location of the external single copy + # - in between the two IR - o_max = min(pos1 ,imax) o_min = min(pos2 + len2, imax) - c_lsc = sum(abs(chloro['LSC'][n]) for n in range(c_begin,c_end)) - c_ssc = sum(abs(chloro['SSC'][n]) for n in range(c_begin,c_end)) + # count of coherent matches for LSC and SSC on the central single copy + c_lsc = abs(sum(chloro['LSC'][n] for n in range(c_begin,c_end))) + c_ssc = abs(sum(chloro['SSC'][n] for n in range(c_begin,c_end))) - o_lsc = sum(abs(chloro['LSC'][n]) for n in range(0,o_max)) - o_ssc = sum(abs(chloro['SSC'][n]) for n in range(0,o_max)) + # count of coherent matches for LSC and SSC on the external single copy + # this score is in two parts before the first copy and after the second + o_lsc = sum(chloro['LSC'][n] for n in range(0,o_max)) + o_ssc = sum(chloro['SSC'][n] for n in range(0,o_max)) - o_lsc += sum(abs(chloro['LSC'][n]) for n in range(o_min,len(chloro['LSC']))) - o_ssc += sum(abs(chloro['SSC'][n]) for n in range(o_min,len(chloro['SSC']))) + o_lsc += sum(chloro['LSC'][n] for n in range(o_min,imax)) + o_ssc += sum(chloro['SSC'][n] for n in range(o_min,imax)) + + o_lsc = abs(o_lsc) + o_ssc = abs(o_ssc) c = float(c_lsc + c_ssc) o = float(o_lsc + o_ssc) - + if c > 0: c_lsc /= c c_ssc /= c @@ -78,10 +102,9 @@ for line in repeats: o_ssc /= o score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0 - # print >>sys.stderr,"c.lsc = %f c.ssc = %f o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1) - if (score > scoreMax): + if (score >= scoreMax) and ((len1 > len1Max) or (len2 > len2Max)): scoreMax = score pos1Max = pos1 pos2Max = pos2 @@ -99,8 +122,8 @@ c_ssc = sum(chloro['SSC'][n] for n in range(c_begin,c_end)) o_lsc = sum(chloro['LSC'][n] for n in range(0,o_max)) o_ssc = sum(chloro['SSC'][n] for n in range(0,o_max)) -o_lsc += sum(chloro['LSC'][n] for n in range(o_min,len(chloro['LSC']))) -o_ssc += sum(chloro['SSC'][n] for n in range(o_min,len(chloro['SSC']))) +o_lsc += sum(chloro['LSC'][n] for n in range(o_min,imax)) +o_ssc += sum(chloro['SSC'][n] for n in range(o_min,imax)) if abs(c_lsc) > abs(c_ssc): center = "LSC" @@ -132,4 +155,4 @@ sys.stdout.write("%s %s %s %s %d %d %d %d %6.5f\n" % (center, #for p in range(chlorosize): # sys.stdout.write("%d %d %d\n" % (p,chloro['SSC'][p],chloro['LSC'][p])) - \ No newline at end of file + diff --git a/detectors/trna/bin/go_trna.sh b/detectors/trna/bin/go_trna.sh index dd6f468..66964d0 100755 --- a/detectors/trna/bin/go_trna.sh +++ b/detectors/trna/bin/go_trna.sh @@ -33,7 +33,7 @@ pushTmpDir ORG.trna TRNA=$(basename ${QUERY}) aragorn -i -w -seq ${QUERY} | \ - ${PROG_DIR}/../lib/aragorn_wrapper.awk + ${AwkCmd} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk popTmpDir diff --git a/org-annotate.sh b/org-annotate.sh index 5417d8c..993428f 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -34,19 +34,19 @@ pushTmpDir ORG.organnot loginfo "Done." loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." - ${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot" + ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the tRNA..." - ${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the CDS..." - ${PROG_DIR}/detectors/cds/bin/go_cds.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Printing annotations header..." @@ -96,11 +96,11 @@ pushTmpDir ORG.organnot freq["G"]" G; "\ freq["T"]" T; "\ other" other;" \ - }' ${QUERY} + }' "${RESULTS}.norm.fasta" loginfo "Done." loginfo "Reformating sequences..." - lines=$(wc -l ${QUERY} | awk '{print $1}') + lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}') awk -v lines=$lines ' \ ! /^>/ { \ seq=tolower($0); \ @@ -115,7 +115,7 @@ pushTmpDir ORG.organnot if (NR==lines) \ {pos-=1}; \ printf(" %6d\n",pos) \ - }' ${QUERY} + }' "${RESULTS}.norm.fasta" loginfo "Done." loginfo "Closing sequence part..." diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh index bb60ba2..72d4a81 100644 --- a/scripts/bash_init.sh +++ b/scripts/bash_init.sh @@ -19,7 +19,7 @@ function getAbsolutePath { # Manage temp directory function pushTmpDir { - TMP_DIR=$(mktemp -d -t "$1_proc_$$_") + TMP_DIR=$(mktemp -d -t "$1_proc_$$_XXXXXX") pushd $TMP_DIR >& /dev/null TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK" logdebug "Pushing temp directory $TMP_DIR"