dd9b23bc77481bc436f91c5d598dbe9e5a8e308f

@ -118,9 +118,9 @@ INCDIR	 =	$(abspath $(PRTDIR))/include
# default gmake variable in implicit rules
# default gmake variable in implicit rules
# ------------------------------------
# ------------------------------------


CFLAGS 	  = 	$(OPTIM) $(MACHDEF) -I$(INCDIR)
CFLAGS 	  := $(CFLAGS)	$(OPTIM) $(MACHDEF) -I$(INCDIR)


LDFLAGS	  =     -L$(LIBDIR) -L.
LDFLAGS	  := $(LDFLAGS)    -L$(LIBDIR) -L.


LDLIBS	  =	    $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS)
LDLIBS	  =	    $(LIBS) $(MALLOC_LIBS) $(MATH_LIBS) $(CC_LIBS)


@ -100,7 +100,7 @@ pushTmpDir ORG.normalize
	tmpLSC="tmp_$$_LSC.fasta"		
	tmpLSC="tmp_$$_LSC.fasta"		
	tmpSSC="tmp_$$_SSC.fasta"		
	tmpSSC="tmp_$$_SSC.fasta"		
	
	
	# Extract the first SC present in between the two IRs
	# Extract the central SC present in between the two IRs
	# considering it as LSC
	# considering it as LSC


	let "beginLSC=$endIR1+1"
	let "beginLSC=$endIR1+1"
@ -110,7 +110,7 @@ pushTmpDir ORG.normalize
	strandLSC="${IR[1]}"
	strandLSC="${IR[1]}"


	# Extract the second SC present in two parts
	# Extract the external SC present in two parts
	# Considering it as SSC
	# Considering it as SSC
	
	
	let "beginSSC=$endIR2+1"
	let "beginSSC=$endIR2+1"
@ -130,16 +130,17 @@ pushTmpDir ORG.normalize
	
	
		# Actually this is the opposite: LSC is SSC and SSC is LSC
		# Actually this is the opposite: LSC is SSC and SSC is LSC


		# Exchange the SSC and LSC sequences
		# Exchanges the SSC and LSC sequences
		mv ${tmpSSC}    ${tmpfasta1}
		mv ${tmpSSC}    ${tmpfasta1}
		mv ${tmpLSC}    ${tmpSSC}
		mv ${tmpLSC}    ${tmpSSC}
		mv ${tmpfasta1} ${tmpLSC}
		mv ${tmpfasta1} ${tmpLSC}
		
		
		# Exchange the IRa and IRb sequences
		# Exchanges the IRa and IRb sequences
		mv ${tmpIR1}    ${tmpfasta1}
		mv ${tmpIR1}    ${tmpfasta1}
		mv ${tmpIR2}    ${tmpIR1}
		mv ${tmpIR2}    ${tmpIR1}
		mv ${tmpfasta1} ${tmpIR2}
		mv ${tmpfasta1} ${tmpIR2}
		
		
		# Exchanges the strand of both the Single copies
		tmp=${strandSSC}
		tmp=${strandSSC}
		strandSSC=${strandLSC}
		strandSSC=${strandLSC}
		strandLSC=${tmp}
		strandLSC=${tmp}
@ -162,7 +163,6 @@ pushTmpDir ORG.normalize
	cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta
	cat ${tmpLSC} ${tmpIR2} ${tmpSSC} ${tmpIR1} | joinfasta


popTmpDir
popTmpDir


exit 0
exit 0

@ -1,3 +1,5 @@

#!/bin/bash


source "${THIS_DIR}/../../../scripts/bash_init.sh"
source "${THIS_DIR}/../../../scripts/bash_init.sh"


SELECTIR="${PROG_DIR}/../../normalize/lib/selectIR.py"
SELECTIR="${PROG_DIR}/../../normalize/lib/selectIR.py"
@ -10,12 +12,21 @@ function lookForIR {
	
	
	local REPEATS="${MATCHES/.*/}.repseek"
	local REPEATS="${MATCHES/.*/}.repseek"
	
	
	# Blast columns:

	# 	query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score

	# We keep blast matches if : 

	#	The match is longer than 100 (alignment length, column 4)

	#   The identity is higher than 80%

	#

	# The match file has the following format:

	#	LSC/SSC  begin end  same_strand=1/diff_strand=0

	
	loginfo "Locating SSC and LSC by similarity..."
	loginfo "Locating SSC and LSC by similarity..."
		blastn -db ${SCDB} \
		blastn -db ${SCDB} \
		       -query ${QUERY} \
		       -query ${QUERY} \
		       -outfmt 6 \
		       -outfmt 6 \
		       -max_target_seqs 10000 | \
		       -max_target_seqs 10000 | \
		  awk '($4 > 1000) && ($3>80) { \
		  awk '($4 > 100) && ($3>80) { \
		             SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \
		             SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10)); \
			 		 if ($7 < $8) \
			 		 if ($7 < $8) \
			 			{print substr($2,1,3),$7,$8,SAME}  \
			 			{print substr($2,1,3),$7,$8,SAME}  \
@ -24,6 +35,7 @@ function lookForIR {
		  sort -nk 2 > ${MATCHES}
		  sort -nk 2 > ${MATCHES}
	loginfo "Done"
	loginfo "Done"


	loginfo "Looking for long inverted repeats..."
	loginfo "Looking for long inverted repeats..."
		repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS}
		repseek -c -p 0.001 -i ${QUERY} 2>> /dev/null > ${REPEATS}
		loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified"
		loginfo " --> $(wc -l ${REPEATS} | awk '{print $1}') repeats identified"

@ -8,6 +8,12 @@ repeats = open(sys.argv[2])
chloro    = {'LSC' : [], 'SSC' : [] }
chloro    = {'LSC' : [], 'SSC' : [] }
chlorosize =0
chlorosize =0


# We scan the blast matches:

#    We build a vector with one position per base pair counting the matches


# The match file has the following format:

#    LSC/SSC  begin end  same_strand=1/diff_strand=0


for line in data:
for line in data:
    parts = line.strip().split()
    parts = line.strip().split()
    if len(parts) >= 4:
    if len(parts) >= 4:
@ -16,7 +22,8 @@ for line in data:
        end         = int(parts[2])
        end         = int(parts[2])
        direction   = int(parts[3])
        direction   = int(parts[3])
        
        
        # Change the code of the direction:

        #    reverse complement = -1
        if direction==0:
        if direction==0:
            direction=-1
            direction=-1
            
            
@ -39,32 +46,49 @@ maxLSC = float(max(abs(n) for n in chloro['LSC']))
chloro['SSC']=[n / maxSSC for n in chloro['SSC']]
chloro['SSC']=[n / maxSSC for n in chloro['SSC']]
chloro['LSC']=[n / maxLSC for n in chloro['LSC']]
chloro['LSC']=[n / maxLSC for n in chloro['LSC']]


scoreMax=0
scoreMax=0

len1Max=0

len2Max=0


imax = len(chloro['LSC'])
imax = len(chloro['LSC'])


for line in repeats:
for line in repeats:
    parts   = line.strip().split()
    parts   = line.strip().split()
    
    
    # First repeat position and length 

    # (converted to a 0-based position)
    pos1    = int(parts[1]) -1
    pos1    = int(parts[1]) -1
    len1    = int(parts[3])
    len1    = int(parts[3])
    
    
    # Second repeat position and length

    # (converted to a 0-based position)
    pos2    = int(parts[2]) -1
    pos2    = int(parts[2]) -1
    len2    = int(parts[4])
    len2    = int(parts[4])
    
    
    # Location of the central single copy 

    #       - in between the two IR -
    c_begin = min(pos1 + len1,imax)
    c_begin = min(pos1 + len1,imax)
    c_end   = min(pos2,imax)
    c_end   = min(pos2,imax)

    
    # Location of the external single copy 

    #       - outside the two IR: before the first and after the second -
    o_max   = min(pos1 ,imax)
    o_max   = min(pos1 ,imax)
    o_min   = min(pos2 + len2, imax)
    o_min   = min(pos2 + len2, imax)
    
    
    c_lsc   = sum(abs(chloro['LSC'][n]) for n in range(c_begin,c_end))
    # count of coherent matches for LSC and SSC on the central single copy 
    c_ssc   = sum(abs(chloro['SSC'][n]) for n in range(c_begin,c_end))
    c_lsc   = abs(sum(chloro['LSC'][n] for n in range(c_begin,c_end)))

    c_ssc   = abs(sum(chloro['SSC'][n] for n in range(c_begin,c_end)))


    o_lsc   = sum(abs(chloro['LSC'][n]) for n in range(0,o_max))
    # count of coherent matches for LSC and SSC on the external single copy 
    o_ssc   = sum(abs(chloro['SSC'][n]) for n in range(0,o_max))
    #    this score is in two parts before the first copy and after the second

    o_lsc   = sum(chloro['LSC'][n] for n in range(0,o_max))

    o_ssc   = sum(chloro['SSC'][n] for n in range(0,o_max))


    o_lsc  += sum(abs(chloro['LSC'][n]) for n in range(o_min,len(chloro['LSC'])))
    o_lsc  += sum(chloro['LSC'][n] for n in range(o_min,imax))
    o_ssc  += sum(abs(chloro['SSC'][n]) for n in range(o_min,len(chloro['SSC'])))
    o_ssc  += sum(chloro['SSC'][n] for n in range(o_min,imax))

    
    o_lsc   = abs(o_lsc)

    o_ssc   = abs(o_ssc)
    
    
    c = float(c_lsc + c_ssc)
    c = float(c_lsc + c_ssc)
    o = float(o_lsc + o_ssc)
    o = float(o_lsc + o_ssc)
@ -78,10 +102,9 @@ for line in repeats:
        o_ssc /= o 
        o_ssc /= o 
    
    
    score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0
    score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0
    

    # print >>sys.stderr,"c.lsc = %f c.ssc = %f   o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1)
    # print >>sys.stderr,"c.lsc = %f c.ssc = %f   o.lsc = %f o.ssc = %f score = %6.4f (len=%d)" % (c_lsc,c_ssc,o_lsc,o_ssc,score,len1)
        
        
    if (score > scoreMax):
    if (score >= scoreMax) and ((len1 > len1Max) or (len2 > len2Max)):
        scoreMax = score
        scoreMax = score
        pos1Max  = pos1
        pos1Max  = pos1
        pos2Max  = pos2
        pos2Max  = pos2
@ -99,8 +122,8 @@ c_ssc   = sum(chloro['SSC'][n] for n in range(c_begin,c_end))
o_lsc   = sum(chloro['LSC'][n] for n in range(0,o_max))
o_lsc   = sum(chloro['LSC'][n] for n in range(0,o_max))
o_ssc   = sum(chloro['SSC'][n] for n in range(0,o_max))
o_ssc   = sum(chloro['SSC'][n] for n in range(0,o_max))


o_lsc  += sum(chloro['LSC'][n] for n in range(o_min,len(chloro['LSC'])))
o_lsc  += sum(chloro['LSC'][n] for n in range(o_min,imax))
o_ssc  += sum(chloro['SSC'][n] for n in range(o_min,len(chloro['SSC'])))
o_ssc  += sum(chloro['SSC'][n] for n in range(o_min,imax))


if abs(c_lsc) > abs(c_ssc):
if abs(c_lsc) > abs(c_ssc):
    center = "LSC"  
    center = "LSC"  

@ -33,7 +33,7 @@ pushTmpDir ORG.trna
	TRNA=$(basename ${QUERY})
	TRNA=$(basename ${QUERY})
	
	
	aragorn -i -w -seq ${QUERY} | \
	aragorn -i -w -seq ${QUERY} | \
		${PROG_DIR}/../lib/aragorn_wrapper.awk
		${AwkCmd} -f ${PROG_DIR}/../lib/aragorn_wrapper.awk
	
	
popTmpDir
popTmpDir

@ -34,19 +34,19 @@ pushTmpDir ORG.organnot
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
	loginfo "Annotating the Inverted repeats and Single copies (LSC and SSC)..."
		${PROG_DIR}/detectors/ir/bin/go_ir.sh ${QUERY} > "${RESULTS}.annot"		
		${PROG_DIR}/detectors/ir/bin/go_ir.sh "${RESULTS}.norm.fasta" > "${RESULTS}.annot"		
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Annotating the tRNA..."
	loginfo "Annotating the tRNA..."
		${PROG_DIR}/detectors/trna/bin/go_trna.sh ${QUERY} >> "${RESULTS}.annot"
		${PROG_DIR}/detectors/trna/bin/go_trna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Annotating the rRNA genes..."
	loginfo "Annotating the rRNA genes..."
		${PROG_DIR}/detectors/rrna/bin/go_rrna.sh ${QUERY} >> "${RESULTS}.annot"
		${PROG_DIR}/detectors/rrna/bin/go_rrna.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."
	loginfo "Done."


	loginfo "Annotating the CDS..."
	loginfo "Annotating the CDS..."
		${PROG_DIR}/detectors/cds/bin/go_cds.sh ${QUERY} >> "${RESULTS}.annot"
		${PROG_DIR}/detectors/cds/bin/go_cds.sh "${RESULTS}.norm.fasta" >> "${RESULTS}.annot"
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Printing annotations header..."
	loginfo "Printing annotations header..."
@ -96,11 +96,11 @@ pushTmpDir ORG.organnot
			 		      freq["G"]" G; "\
			 		      freq["G"]" G; "\
			 		      freq["T"]" T; "\
			 		      freq["T"]" T; "\
			 		      other" other;" \
			 		      other" other;" \
			 }' ${QUERY}
			 }' "${RESULTS}.norm.fasta"
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Reformating sequences..."
	loginfo "Reformating sequences..."
		lines=$(wc -l ${QUERY} | awk '{print $1}')
		lines=$(wc -l "${RESULTS}.norm.fasta" | awk '{print $1}')
		awk -v lines=$lines ' \
		awk -v lines=$lines ' \
			! /^>/ { \
			! /^>/ { \
					seq=tolower($0); \
					seq=tolower($0); \
@ -115,7 +115,7 @@ pushTmpDir ORG.organnot
					if (NR==lines) \
					if (NR==lines) \
					  {pos-=1}; \
					  {pos-=1}; \
					printf("   %6d\n",pos) \
					printf("   %6d\n",pos) \
			   }' ${QUERY}
			   }' "${RESULTS}.norm.fasta"
	loginfo "Done."
	loginfo "Done."
	
	
	loginfo "Closing sequence part..."
	loginfo "Closing sequence part..."

@ -19,7 +19,7 @@ function getAbsolutePath {
# Manage temp directory
# Manage temp directory


function pushTmpDir {
function pushTmpDir {
	TMP_DIR=$(mktemp -d -t "$1_proc_$$_")
	TMP_DIR=$(mktemp -d -t "$1_proc_$$_XXXXXX")
	pushd $TMP_DIR >& /dev/null
	pushd $TMP_DIR >& /dev/null
	TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
	TMP_DIR_STACK="$TMP_DIR $TMP_DIR_STACK"
	logdebug "Pushing temp directory $TMP_DIR"
	logdebug "Pushing temp directory $TMP_DIR"