From 9d2d60e9f0aaa8ba787e03570d96847f843a121f Mon Sep 17 00:00:00 2001 From: "Eric Coissac eric.coissac@ujf-grenoble.fr" Date: Mon, 9 Nov 2015 11:35:00 +0100 Subject: [PATCH 1/8] Patch config/default.conf for considering a CFLAGS and LDFLAGS locally predefined variables Former-commit-id: 29597cc21be2605b6753e4d64bfeb165b210309b Former-commit-id: b2c4aed567119dae4f596f04c83648102639b000 --- config/default.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/default.conf b/config/default.conf index ff863ec..8bdb54c 100644 --- a/config/default.conf +++ b/config/default.conf @@ -118,9 +118,9 @@ INCDIR = $(abspath $(PRTDIR))/include # default gmake variable in implicit rules # ------------------------------------ -CFLAGS = $(OPTIM) $(MACHDEF) -I$(INCDIR) +CFLAGS := $(CFLAGS) $(OPTIM) $(MACHDEF) -I$(INCDIR) -LDFLAGS = -L$(LIBDIR) -L. +LDFLAGS := $(LDFLAGS) -L$(LIBDIR) -L. LDLIBS = $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS) From cf99ca56a4f34b8ad3f3753ee23acc2c92c9ebbe Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 13:11:54 +0100 Subject: [PATCH 2/8] Patches for the cluster luke Former-commit-id: 54cfd925a4cb13d389bd3086164b99bd246319e7 Former-commit-id: d865056147c56bf0c6220e29d7f40e26f140a4ba --- detectors/ir/bin/go_ir.sh | 2 +- detectors/normalize/lib/lookforIR.lib.sh | 2 ++ detectors/trna/bin/go_trna.sh | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/detectors/ir/bin/go_ir.sh b/detectors/ir/bin/go_ir.sh index e199449..2f7a0bb 100755 --- a/detectors/ir/bin/go_ir.sh +++ b/detectors/ir/bin/go_ir.sh @@ -61,4 +61,4 @@ pushTmpDir ORG.ir popTmpDir -exit 0 \ No newline at end of file +exit 0 diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 1919a21..0070c2d 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -1,3 +1,5 @@ +#!/bin/bash + source "${THIS_DIR}/../../../scripts/bash_init.sh" SELECTIR="${PROG_DIR}/../../normalize/lib/selectIR.py" diff --git a/detectors/trna/bin/go_trna.sh b/detectors/trna/bin/go_trna.sh index dd6f468..0647825 100755 --- a/detectors/trna/bin/go_trna.sh +++ b/detectors/trna/bin/go_trna.sh @@ -33,7 +33,7 @@ pushTmpDir ORG.trna TRNA=$(basename ${QUERY}) aragorn -i -w -seq ${QUERY} | \ - ${PROG_DIR}/../lib/aragorn_wrapper.awk + ${CmdAwk} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk popTmpDir From 8b27f23a8ef7cfe5581a8a994c77f1c11041df9e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 14:01:21 +0100 Subject: [PATCH 3/8] change the call to awk Former-commit-id: fa0b02e712db163ba6d37407cea46d1d413a7cb5 Former-commit-id: f393558b7962188159ac123d654ae14ee581f5f8 --- detectors/trna/bin/go_trna.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detectors/trna/bin/go_trna.sh b/detectors/trna/bin/go_trna.sh index 0647825..66964d0 100755 --- a/detectors/trna/bin/go_trna.sh +++ b/detectors/trna/bin/go_trna.sh @@ -33,7 +33,7 @@ pushTmpDir ORG.trna TRNA=$(basename ${QUERY}) aragorn -i -w -seq ${QUERY} | \ - ${CmdAwk} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk + ${AwkCmd} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk popTmpDir From ae703c70d66e181363e7c4e9477e7b7169bbbfe5 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 15:27:32 +0100 Subject: [PATCH 4/8] small modifications for the luke cluster Former-commit-id: 55a83611220f824a0efdb1267816f98df984152e Former-commit-id: e01bf6440c2981904f31807766e9e48aea8b18c1 --- detectors/normalize/lib/lookforIR.lib.sh | 1 + detectors/normalize/lib/selectIR.py | 4 ++-- scripts/bash_init.sh | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 0070c2d..437b77c 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -25,6 +25,7 @@ function lookForIR { {print substr($2,1,3),$8,$7,SAME}}' | \ sort -nk 2 > ${MATCHES} loginfo "Done" + loginfo "Looking for long inverted repeats..." repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} diff --git a/detectors/normalize/lib/selectIR.py b/detectors/normalize/lib/selectIR.py index cefe74b..abe7dbd 100755 --- a/detectors/normalize/lib/selectIR.py +++ b/detectors/normalize/lib/selectIR.py @@ -32,7 +32,7 @@ for line in data: for p in range(begin,end): chr[p]+=direction - + maxSSC = float(max(abs(n) for n in chloro['SSC'])) maxLSC = float(max(abs(n) for n in chloro['LSC'])) @@ -132,4 +132,4 @@ sys.stdout.write("%s %s %s %s %d %d %d %d %6.5f\n" % (center, #for p in range(chlorosize): # sys.stdout.write("%d %d %d\n" % (p,chloro['SSC'][p],chloro['LSC'][p])) - \ No newline at end of file + diff --git a/scripts/bash_init.sh b/scripts/bash_init.sh index bb60ba2..72d4a81 100644 --- a/scripts/bash_init.sh +++ b/scripts/bash_init.sh @@ -19,7 +19,7 @@ function getAbsolutePath { # Manage temp directory function pushTmpDir { - TMP_DIR=$(mktemp -d -t "$1_proc_$$_") + TMP_DIR=$(mktemp -d -t "$1_proc_$$_XXXXXX") pushd $TMP_DIR >& /dev/null TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK" logdebug "Pushing temp directory $TMP_DIR" From 58e99a81eba2bd74630b8d5559cfeb6bfdd6116f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 15:34:59 +0100 Subject: [PATCH 5/8] Run the annotation on the normalized sequence Former-commit-id: 48e30ca5dff232ebf886d351cfd254b3fc3a0798 Former-commit-id: e4832b9fe1c644d232b6bc724ab98d5ee18ab6eb --- org-annotate.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/org-annotate.sh b/org-annotate.sh index 5417d8c..2ea1b66 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -38,15 +38,15 @@ pushTmpDir ORG.organnot loginfo "Done." loginfo "Annotating the tRNA..." - ${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the rRNA genes..." - ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the CDS..." - ${PROG_DIR}/detectors/cds/bin/go_cds.sh ${QUERY} >> "${RESULTS}.annot" + ${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot" loginfo "Done." loginfo "Printing annotations header..." @@ -96,11 +96,11 @@ pushTmpDir ORG.organnot freq["G"]" G; "\ freq["T"]" T; "\ other" other;" \ - }' ${QUERY} + }' "${RESULTS}.norm.fasta" loginfo "Done." loginfo "Reformating sequences..." - lines=$(wc -l ${QUERY} | awk '{print $1}') + lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}') awk -v lines=$lines ' \ ! /^>/ { \ seq=tolower($0); \ @@ -115,7 +115,7 @@ pushTmpDir ORG.organnot if (NR==lines) \ {pos-=1}; \ printf(" %6d\n",pos) \ - }' ${QUERY} + }' "${RESULTS}.norm.fasta" loginfo "Done." loginfo "Closing sequence part..." From 195739b5f8f2cd195a9253dd9e8f26a2eb8e6684 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 17:03:22 +0100 Subject: [PATCH 6/8] Patch bug in the inverted repeats annotation Realize the annotation on the normalized chromosome Former-commit-id: 73e4e016d7bfe23fc92e4071efb0a4629eb8e21f Former-commit-id: d44249c0d078d475ca80386bb428fb660289b0dd --- detectors/normalize/bin/go_normalize.sh | 11 +++--- detectors/normalize/lib/lookforIR.lib.sh | 11 +++++- detectors/normalize/lib/selectIR.py | 49 +++++++++++++++++------- org-annotate.sh | 2 +- 4 files changed, 53 insertions(+), 20 deletions(-) diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index 200e21d..5ef1865 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -100,7 +100,7 @@ pushTmpDir ORG.normalize tmpLSC="tmp_$$_LSC.fasta" tmpSSC="tmp_$$_SSC.fasta" - # Extract the first SC present in between the two IRs + # Extract the central SC present in between the two IRs # considering it as LSC let "beginLSC=$endIR1+1" @@ -110,7 +110,7 @@ pushTmpDir ORG.normalize strandLSC="${IR[1]}" - # Extract the second SC present in two parts + # Extract the external SC present in two parts # Considering it as SSC let "beginSSC=$endIR2+1" @@ -130,16 +130,17 @@ pushTmpDir ORG.normalize # Actually this is the oposite LSC is SSC and SSC is LSC - # Exchange the SSC and LSC sequences + # Exchanges the SSC and LSC sequences mv ${tmpSSC} ${tmpfasta1} mv ${tmpLSC} ${tmpSSC} mv ${tmpfasta1} ${tmpLSC} - # Exchange the IRa and IRb sequences + # Exchanges the IRa and IRb sequences mv ${tmpIR1} ${tmpfasta1} mv ${tmpIR2} ${tmpIR1} mv ${tmpfasta1} ${tmpIR2} + # Exchanges the strand of both the Single copies tmp=${strandSSC} strandSSC=${strandLSC} strandLSC=${tmp} @@ -161,7 +162,7 @@ pushTmpDir ORG.normalize # Merges the four parts of the genome. cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta - + exit 1 popTmpDir diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 437b77c..3b35206 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -12,6 +12,15 @@ function lookForIR { local REPEATS="${MATCHES/.*/}.repseek" + # Blast columns: + # query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score + # We keep blast matches if : + # The match is longer than 1000 + # The identity is higher than 80% + # + # The match file has the following format: + # LSC/SSC begin end same_strand=1/diff_strand=0 + loginfo "Locating SSC and LSC by similarity..." blastn -db ${SCDB} \ -query ${QUERY} \ @@ -31,7 +40,7 @@ function lookForIR { repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS} loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified" loginfo "Done" - + loginfo "Marking and selecting the best inverted repeat..." local IR=( $(${SELECTIR} ${MATCHES} ${REPEATS}) ) loginfo "Done" diff --git a/detectors/normalize/lib/selectIR.py b/detectors/normalize/lib/selectIR.py index abe7dbd..7bd216d 100755 --- a/detectors/normalize/lib/selectIR.py +++ b/detectors/normalize/lib/selectIR.py @@ -8,6 +8,12 @@ repeats = open(sys.argv[2]) chloro = {'LSC' : [], 'SSC' : [] } chlorosize =0 +# We scan the blast matches: +# We build a vector with one position per base pair counting the matches + +# The match file has the following format: +# LSC/SSC begin end same_strand=1/diff_strand=0 + for line in data: parts = line.strip().split() if len(parts) >= 4: @@ -16,7 +22,8 @@ for line in data: end = int(parts[2]) direction = int(parts[3]) - + # Change the code of the direction: + # reverse complement = -1 if direction==0: direction=-1 @@ -39,36 +46,53 @@ maxLSC = float(max(abs(n) for n in chloro['LSC'])) chloro['SSC']=[n / maxSSC for n in chloro['SSC']] chloro['LSC']=[n / maxLSC for n in chloro['LSC']] - scoreMax=0 +len1Max=0 +len2Max=0 + imax = len(chloro['LSC']) for line in repeats: parts = line.strip().split() + # First repeat position and length + # (position start at 0) pos1 = int(parts[1]) -1 len1 = int(parts[3]) + # Second repeat position and length + # (position start at 0) pos2 = int(parts[2]) -1 len2 = int(parts[4]) + # Location of the central single copy + # - in between the two IR - c_begin = min(pos1 + len1,imax) c_end = min(pos2,imax) + + # Location of the external single copy + # - in between the two IR - o_max = min(pos1 ,imax) o_min = min(pos2 + len2, imax) - c_lsc = sum(abs(chloro['LSC'][n]) for n in range(c_begin,c_end)) - c_ssc = sum(abs(chloro['SSC'][n]) for n in range(c_begin,c_end)) + # count of coherent matches for LSC and SSC on the central single copy + c_lsc = abs(sum(chloro['LSC'][n] for n in range(c_begin,c_end))) + c_ssc = abs(sum(chloro['SSC'][n] for n in range(c_begin,c_end))) - o_lsc = sum(abs(chloro['LSC'][n]) for n in range(0,o_max)) - o_ssc = sum(abs(chloro['SSC'][n]) for n in range(0,o_max)) + # count of coherent matches for LSC and SSC on the external single copy + # this score is in two parts before the first copy and after the second + o_lsc = sum(chloro['LSC'][n] for n in range(0,o_max)) + o_ssc = sum(chloro['SSC'][n] for n in range(0,o_max)) - o_lsc += sum(abs(chloro['LSC'][n]) for n in range(o_min,len(chloro['LSC']))) - o_ssc += sum(abs(chloro['SSC'][n]) for n in range(o_min,len(chloro['SSC']))) + o_lsc += sum(chloro['LSC'][n] for n in range(o_min,imax)) + o_ssc += sum(chloro['SSC'][n] for n in range(o_min,imax)) + + o_lsc = abs(o_lsc) + o_ssc = abs(o_ssc) c = float(c_lsc + c_ssc) o = float(o_lsc + o_ssc) - + if c > 0: c_lsc /= c c_ssc /= c @@ -78,10 +102,9 @@ for line in repeats: o_ssc /= o score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0 - # print >>sys.stderr,"c.lsc = %f c.ssc = %f o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1) - if (score > scoreMax): + if (score >= scoreMax) and ((len1 > len1Max) or (len2 > len2Max)): scoreMax = score pos1Max = pos1 pos2Max = pos2 @@ -99,8 +122,8 @@ c_ssc = sum(chloro['SSC'][n] for n in range(c_begin,c_end)) o_lsc = sum(chloro['LSC'][n] for n in range(0,o_max)) o_ssc = sum(chloro['SSC'][n] for n in range(0,o_max)) -o_lsc += sum(chloro['LSC'][n] for n in range(o_min,len(chloro['LSC']))) -o_ssc += sum(chloro['SSC'][n] for n in range(o_min,len(chloro['SSC']))) +o_lsc += sum(chloro['LSC'][n] for n in range(o_min,imax)) +o_ssc += sum(chloro['SSC'][n] for n in range(o_min,imax)) if abs(c_lsc) > abs(c_ssc): center = "LSC" diff --git a/org-annotate.sh b/org-annotate.sh index 2ea1b66..993428f 100755 --- a/org-annotate.sh +++ b/org-annotate.sh @@ -34,7 +34,7 @@ pushTmpDir ORG.organnot loginfo "Done." loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..." - ${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot" + ${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot" loginfo "Done." loginfo "Annotating the tRNA..." From 32908c68093537795d0ee0e781265e5ff60d35c2 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 17:17:34 +0100 Subject: [PATCH 7/8] remove the debug code Former-commit-id: 7a805dc6278b8aae97e6be8deb7e9035bc4aa853 Former-commit-id: 54e6c6030fd0c876ad05745a6a51d14a523e313e --- detectors/normalize/bin/go_normalize.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/detectors/normalize/bin/go_normalize.sh b/detectors/normalize/bin/go_normalize.sh index 5ef1865..0fbd8b9 100755 --- a/detectors/normalize/bin/go_normalize.sh +++ b/detectors/normalize/bin/go_normalize.sh @@ -162,7 +162,6 @@ pushTmpDir ORG.normalize # Merges the four parts of the genome. cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta - exit 1 popTmpDir From 813e3958ba9aa23ed29d356fa086860924037650 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 9 Nov 2015 17:35:59 +0100 Subject: [PATCH 8/8] Change minimum length for considering a match from 1000 to 100. Former-commit-id: 6bd827ca011ee71d83e98710edc837f56a089875 Former-commit-id: 454f080c0b163f238951541eec23b5946f914f28 --- detectors/normalize/lib/lookforIR.lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detectors/normalize/lib/lookforIR.lib.sh b/detectors/normalize/lib/lookforIR.lib.sh index 3b35206..94ea75b 100644 --- a/detectors/normalize/lib/lookforIR.lib.sh +++ b/detectors/normalize/lib/lookforIR.lib.sh @@ -26,7 +26,7 @@ function lookForIR { -query ${QUERY} \ -outfmt 6 \ -max_target_seqs 10000 | \ - awk '($4 > 1000) && ($3>80) { \ + awk '($4 > 100) && ($3>80) { \ SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \ if ($7 < $8) \ {print substr($2,1,3),$7,$8,SAME} \