9973d1cf7146dc452883c21389e28339c119d562

@ -39,240 +39,196 @@
# ex : joinfasta $1
#========================================================================================

# Paths of the helper programs looked up in the directory of this script.
# The inner expansions are quoted so that an installation directory
# containing blanks is still resolved correctly.
FINDCDS="$(dirname "$0")/findcds"
REPSEEK="$(dirname "$0")/repseek"

# -- CAUTION -- Works as long as the script
#               is not called through a symlink
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
source "${SCRIPT_DIR}/../../../scripts/bash_init.sh"

# takes a .fasta file as input
# $3 : number of characters in the file, t : number of characters in the title line,
# $1 : number of newlines in the file (the title's newline is already counted in t, hence the + 1)
function seqlength {
  cat $1 | \
  wc |\
  awk -v t="`head -1 $1 | wc -c`" '{print $3 - t - $1 + 1}'
# Locates the inverted repeats (IR) of a genome.
# 	- $1 : The fasta file to analyse
#
# Globals read : SCDB (database given to blastn -db), PROG_DIR (directory
#                of selectIR.py)
# Files written in the current directory : <name>.matches and <name>.repseek,
#                where <name> is the base name of $1 cut at its first dot
# Output : the line produced by selectIR.py, normalised by word splitting.
#          The log messages below use field 5 and 7 as the IRa / IRb sizes
#          and field 8 as the score (0-based indices).
# Returns : 1 when selectIR.py did not produce the nine expected fields
function lookForIR {

	local QUERY="$1"

	# The suffix is appended to the name stripped from its first dot on.
	# Substituting the extension instead leaves an extension-less name
	# untouched, and both outputs then end up in the very same file.
	local BASENAME
	BASENAME="$(basename "${QUERY}")"
	local MATCHES="${BASENAME%%.*}.matches"
	local REPEATS="${BASENAME%%.*}.repseek"

	local deltaIR
	local -a IR

	loginfo "Locating SSC and LSC by similarity..."
		# Keeps the hits with column 4 > 1000 and column 3 > 80, and prints
		# the three first letters of the subject name, the query interval
		# in increasing order and a flag telling if query and subject
		# intervals run in the same direction.
		blastn -db "${SCDB}" \
		       -query "${QUERY}" \
		       -outfmt 6 \
		       -max_target_seqs 10000 | \
		  awk '($4 > 1000) && ($3>80) {
		           SAME=(($7 < $8) && ($9 < $10)) || (($7 > $8) && ($9 > $10))
		           if ($7 < $8)
		               print substr($2,1,3),$7,$8,SAME
		           else
		               print substr($2,1,3),$8,$7,SAME
		       }' | \
		  sort -nk 2 > "${MATCHES}"
	loginfo "Done"

	loginfo "Looking for long inverted repeats..."
		repseek -c -p 0.001 -i "${QUERY}" 2> /dev/null > "${REPEATS}"
		loginfo " --> $(wc -l < "${REPEATS}" | awk '{print $1}') repeats identified"
	loginfo "Done"

	loginfo "Marking and selecting the best inverted repeat..."
		# Word splitting is wanted here: one array cell per field
		IR=( $("${PROG_DIR}/selectIR.py" "${MATCHES}" "${REPEATS}") )
	loginfo "Done"

	# Without this guard the arithmetic below fails on empty operands
	if (( ${#IR[@]} < 9 )); then
		logwarning "No inverted repeat could be selected"
		return 1
	fi

	loginfo " --> IR size : IRa = ${IR[5]} /  IRb = ${IR[7]}"
	loginfo " --> IR Score: ${IR[8]}"

	deltaIR=$(( IR[5] - IR[7] ))

	if (( deltaIR < -10 || deltaIR > 10 )); then
		logwarning "Differences between IR lengths ($deltaIR) is greater than 10"
	fi

	echo "${IR[@]}"
}

pushTmpDir ORG.normalize

# Extracts a subsequence from a fasta file
# 	- $1 : The fasta file to cut
# 	- $2 : First position of the subsequence (first position is numbered 1)
# 	- $3 : Last position of the subsequence (included in the subsequence)
# Title lines are printed as they are met; the subsequence follows,
# wrapped at 60 characters per line.
function cutseq {
	awk -v from="$2" -v end="$3" '
		/^>/ { print; next }
		     { seq = seq $0 }
		END  {
			piece = substr(seq, from, end - from + 1)
			n = length(piece)
			for (i = 1; i <= n; i += 60)
				print substr(piece, i, 60)
		}' "$1"
}
	SCDB="${IR_DATA_DIR}/SC_RefDB"
	QUERY="${CALL_DIR}/$1"
	MATCHES="${1/.*/.matches}"
	REPEATS="${1/.*/.repseek}"

# Prints the reverse complement of the sequence stored in a fasta file
# 	- $1 : The fasta file to reverse-complement
# Title lines are printed as they are met; the reverse-complemented
# sequence follows, wrapped at 60 characters per line.
# The whole work is done inside awk: handing the sequence over to
# "echo | tr" and "echo | rev" through a shell command line would be limited
# by the maximal size of a command line and exposed to shell interpretation.
function revcomp {
	awk 'BEGIN {
			# Same pairing as "tr acgtACGT tgcaTGCA";
			# any other symbol is copied unchanged
			pair["a"] = "t"; pair["c"] = "g"; pair["g"] = "c"; pair["t"] = "a"
			pair["A"] = "T"; pair["C"] = "G"; pair["G"] = "C"; pair["T"] = "A"
		}
		/^>/ { print; next }
		     { seq = seq $0 }
		END {
			line = ""
			# Walks the sequence backward, complementing on the fly
			for (i = length(seq); i >= 1; i--) {
				base = substr(seq, i, 1)
				line = line ((base in pair) ? pair[base] : base)
				if (length(line) == 60) {
					print line
					line = ""
				}
			}
			if (line != "")
				print line
		}' "$1"
}


# Rewrites fasta data with 60 sequence characters per line
# 	- $1 : The fasta file to format; standard input is read when omitted
# Title lines are printed as they are met; all the other lines are
# concatenated and printed at the end.
function formatfasta {
	# ${1:+"$1"} vanishes when no file is given, so awk falls back on stdin
	awk '/^>/ { print; next }
	          { seq = seq $0 }
	     END  {
	          n = length(seq)
	          for (i = 1; i <= n; i += 60)
	              print substr(seq, i, 60)
	     }' ${1:+"$1"}
}


# Joins end to end the sequences stored in a fasta file, writing the same
# number of characters on every line (set by formatfasta)
# 	- $1 : The fasta file to join; standard input is read when omitted
# Only the first line is allowed to be a title: the titles found between
# the joined sequences are removed.
function joinfasta {
	# ${1:+"$1"} vanishes when no file is given, so awk falls back on stdin
	awk 'NR == 1 || ! /^>/' ${1:+"$1"} |
		formatfasta
}

# Extracts from the repseek results the origin of the two IR and their size
# 	- $1 : The file handed over to repseek
# Among the 'Distant.inv' lines, the one sorting last on column 4 is kept
# and its 7th column is printed with every '-' turned into a blank.
function lookforIR {
	repseek -c -p 0.001 "$1" |
		grep 'Distant.inv' |
		sort -n -k4 |
		tail -n 1 |
		awk '{ gsub(/-/, " ", $7); print $7 }'
}

# Used to decide whether the analysed fragment has to be pasted forward or
# reverse in the new fasta file
# 	- $1 : The fasta file handed over to ${FINDCDS}
# Prints the sum of ($5 - $4 + 1) over the "Watson" lines minus the same sum
# over the "Crick" lines; lines starting with '#' are ignored.
# NOTE(review): columns 4 and 5 look like the begin and end of each CDS,
# so the sums would be total CDS lengths per strand - to be confirmed
# against the findcds output format.
function maxCDS {
	"${FINDCDS}" -F "$1" -c -l 150 |
		awk '/^[^#]/ && ($2 == "Watson") { Watson += $5 - $4 + 1 }
		     /^[^#]/ && ($2 == "Crick")  { Crick  += $5 - $4 + 1 }
		     END                         { print Watson - Crick }'
}

#
# Example results from repseek
#
#	$1			$2		$3		$4		$5		$6		$7						$8		$9			$10		$11	 $12
# Class			pos_r1	pos_r2	len_r1	len_r2	Delta	Seed					ident	score		Rmean	Rmod frac
# Distant.inv	86608	130934	25319	25319	19007	86608-130934-25319-2.01	100.000	25262.43	2.01	2	 0.99

# Test where the sequence is cut

# Define the variables used below: the start, end and size of the two IRs
genome=$1

genome_length=`seqlength $1`
IRS=(`lookforIR ${genome}`)

posIR1=${IRS[0]}
posIR2=${IRS[1]}
lenIR=${IRS[2]}

let "endIR2=$posIR2 + $lenIR - 1"
let "endIR1=$posIR1 + $lenIR - 1"

# Decide where to cut, depending on:
# - the position of the end of IR2 relative to the whole sequence, in order to detect a
#   cut located inside an IR
# repseek always considers that the end of IR2 cannot lie beyond the end of the sequence,
# whether IR2 is cut or not. So in every case the sequence is cut again midway between
# the end of IR1 and the start of IR2
if (( endIR2 == genome_length )) ; then	
	tmpfasta1="tmp_$$_1.fasta"
	tmpfasta2="tmp_$$_2.fasta"
# defini la localisation de la coupure entre les deux IR
	let "posCut=($endIR1+$posIR2)/2"
# realise la coupure du fichier d'entre du nucleotide calcule jusqu'a la fin de la sequence
	cutseq ${genome} ${posCut} ${genome_length} > ${tmpfasta1}
	let "posCut=$posCut-1"
# realise la coupure du fichier d'entre du debut de la sequence jusqu'au nucleotide calcule
	cutseq ${genome} 1 ${posCut} >> ${tmpfasta1}
# ces deux fragment sont rassembles dans un fichier temporaire
	joinfasta ${tmpfasta1} > ${tmpfasta2}
	rm -f ${tmpfasta1}
	genome=${tmpfasta2}
# recalcul la nouvelle position des IR
	IRS=(`lookforIR ${genome}`)
	posIR1=${IRS[0]}
	posIR2=${IRS[1]}
	lenIR=${IRS[2]}


	openLogFile "${QUERY/.*/.log}"

	loginfo "Computing the genome size..."
		genome_length=$(seqlength $QUERY)
		loginfo " --> $genome_length bp"
	loginfo "Done"
	
	IR=( $(lookForIR ${QUERY}) )
	
	posIR1=${IR[4]}
	posIR2=${IR[6]}
	
	let "lenIR= ( ${IR[5]} +  ${IR[7]} ) / 2 " 

	let "endIR2=$posIR2 + $lenIR - 1"
	let "endIR1=$posIR1 + $lenIR - 1"
fi

tmpIR1="tmp_$$_IR1.fasta"		
tmpIR2="tmp_$$_IR2.fasta"		

#enregistre les deux fragments IR1 et IR2 complet
cutseq ${genome} ${posIR1} ${endIR1} > ${tmpIR1}
cutseq ${genome} ${posIR2} ${endIR2} > ${tmpIR2}

let "lenSC1=$posIR1 -1 + ($genome_length - endIR2)"
let "lenSC2=$posIR2 - $endIR1"

tmpLSC="tmp_$$_LSC.fasta"		
tmpSSC="tmp_$$_SSC.fasta"		

# Decide how to cut, depending on:
# - the size of SC1 compared with the size of SC2, in order to detect a cut located
#   inside an SC. The cut lies inside SC1; the goal is to identify which of SC1 and
#   SC2 is the LSC and which is the SSC
# if SC1 is larger than SC2, then SC1 is the LSC and the cut lies
# inside the LSC
if (( lenSC1 > lenSC2 )); then
# defini le debut de la LSC
	let "beginLSC=$endIR2+1"
	cutseq ${genome} ${beginLSC} ${genome_length} > ${tmpLSC}
# defini la fin de la LSC
	let "endLSC=$posIR1-1"
	cutseq ${genome} 1 ${endLSC} >> ${tmpLSC}
	tmpfasta1="tmp_$$_1.fasta"
# rejoint les deux morceaux pour former la LSC
	joinfasta ${tmpLSC} > ${tmpfasta1}
	mv ${tmpfasta1} ${tmpLSC}

# donc la SC2 est la SSC, 
# definit l'origine et la fin a couper pour avoir le fragment SSC
	let "beginSSC=$endIR1+1"
	let "endSSC=$posIR2-1"
	cutseq ${genome} ${beginSSC} ${endSSC} > ${tmpSSC}
	
	tmp=${tmpIR1}
	tmpIR1=${tmpIR2}
	tmpIR2=${tmp}
else
# sinon la SC2 est la LSC, et la coupe a eu lieu dans la SSC
# definit l'origine et la fin a couper pour avoir le fragment LSC
	if (( "$endIR2" >= "$genome_length" )) ; then	
		loginfo "IRB is at the end of the original sequence"
		
		#
		# We just move the IRB to the beginning of the sequence
		#
		
		# Extract the IRB sequence
		let "posCut=($endIR1+$posIR2)/2"
		cutseq ${QUERY} ${posCut} ${genome_length} > ${tmpfasta1}

		# Append the remaining part of the genome		
		let "posCut=$posCut-1"
		cutseq ${QUERY} 1 ${posCut} >> ${tmpfasta1}
		
		# merges both the parts
		joinfasta ${tmpfasta1} > ${tmpfasta2}
		rm -f ${tmpfasta1}
		QUERY=${tmpfasta2}

		loginfo "Recompute location of the IR..."
			declare -a IR=( $(lookForIR ${QUERY}) )
		loginfo "Done"
		
		posIR1="${IR[4]}"
		posIR2="${IR[6]}"
		
		let "lenIR=(${IR[5]} +  ${IR[7]}) / 2 " 
	
		let "endIR2=$posIR2 + $lenIR - 1"
		let "endIR1=$posIR1 + $lenIR - 1"
		
	fi		
	
	tmpIR1="tmp_$$_IR1.fasta"		
	tmpIR2="tmp_$$_IR2.fasta"		
	
	#enregistre les deux fragments IRa et IRb complet
	cutseq ${QUERY} ${posIR1} ${endIR1} > ${tmpIR1}
	cutseq ${QUERY} ${posIR2} ${endIR2} > ${tmpIR2}
	
	let "lenSC1=$posIR1 -1 + ($genome_length - endIR2)"
	let "lenSC2=$posIR2 - $endIR1"
	
	center="${IR[0]}"
		
	tmpLSC="tmp_$$_LSC.fasta"		
	tmpSSC="tmp_$$_SSC.fasta"		
	
	# Extract the first SC present in between the two IRs
	# considering it as LSC

	let "beginLSC=$endIR1+1"
	let "endLSC=$posIR2-1"
	cutseq ${genome} ${beginLSC} ${endLSC} > ${tmpLSC}
	cutseq ${QUERY} ${beginLSC} ${endLSC} > ${tmpLSC}

# definit le debut de la SSC et coupe la premiere partie
	strandLSC="${IR[1]}"


	# Extract the second SC present in two parts
	# Considering it as SSC
	
	let "beginSSC=$endIR2+1"
	cutseq ${genome} ${beginSSC} ${genome_length} > ${tmpSSC}
# definit la fin de la SSC	et coupe la seconde partie 
	cutseq ${QUERY} ${beginSSC} ${genome_length} > ${tmpSSC}

	let "endSSC=$posIR1-1"
	cutseq ${genome} 1 ${endSSC} >> ${tmpSSC}
# joint les deux parties afin de reformer la SSC
	cutseq ${QUERY} 1 ${endSSC} >> ${tmpSSC}

	joinfasta ${tmpSSC} > ${tmpfasta1}
	mv ${tmpfasta1} ${tmpSSC}
fi
	
	strandSSC="${IR[3]}"
	
	
	if [[ "$center" == "SSC" ]]; then
	
		# Actually this is the opposite: the LSC is the SSC and the SSC is the LSC

		# Exchange the SSC and LSC sequences
		mv ${tmpSSC}    ${tmpfasta1}
		mv ${tmpLSC}    ${tmpSSC}
		mv ${tmpfasta1} ${tmpLSC}
		
		# Exchange the IRa and IRb sequences
		mv ${tmpIR1}    ${tmpfasta1}
		mv ${tmpIR2}    ${tmpIR1}
		mv ${tmpfasta1} ${tmpIR2}
		
		tmp=${strandSSC}
		strandSSC=${strandLSC}
		strandLSC=${tmp}
		
	fi
	
	# Reverse complement the SSC if needed
	if [[ "${strandSSC}" == "-" ]]; then
		fastarevcomp -f ${tmpSSC} > ${tmpfasta1}
		mv ${tmpfasta1} ${tmpSSC}
	fi
	
	# Reverse complement the LSC if needed
	if [[ "${strandLSC}" == "-" ]]; then
		fastarevcomp -f ${tmpLSC} > ${tmpfasta1}
		mv ${tmpfasta1} ${tmpLSC}
	fi
	
	# Merges the four parts of the genome.
	cat ${tmpSSC} ${tmpIR1} ${tmpLSC} ${tmpIR2} | joinfasta


if [[ ! -z $tmpfasta2 ]]; then
	rm -f $tmpfasta2
fi
# determine si les fragments doivent etre recolle en reverse ou forward
maxSSC=`maxCDS ${tmpSSC}`

# si maxSSC est negatif, le rapport Watson - Crick est negatif, le fragment est 
# donc reverse
if (( maxSSC < 0 )); then
	revcomp ${tmpSSC} > ${tmpfasta1}
	mv ${tmpfasta1} ${tmpSSC}
fi

maxLSC=`maxCDS ${tmpLSC}`

# si maxLSC est negatif, le rapport Watson - Crick est negatif, le fragment est 
# donc reverse
if (( maxLSC < 0 )); then
	revcomp ${tmpLSC} > ${tmpfasta1}
	mv ${tmpfasta1} ${tmpLSC}
fi

# The four fragments are pasted back together, without any cutting error and in a known order.
cat ${tmpSSC} ${tmpIR1} ${tmpLSC} ${tmpIR2} | joinfasta
	
	
popTmpDir

exit 0


@ -0,0 +1,135 @@
#!/usr/bin/env python
"""Select, among candidate repeats, the pair best separating LSC from SSC.

Usage: selectIR.py <matches> <repeats>

<matches> holds whitespace separated lines of at least four fields: a region
label ('LSC' or 'SSC'), a begin and an end position (1-based, end included)
and a direction flag (0 is turned into -1, any other integer is used as is).

<repeats> holds whitespace separated lines whose fields 2 and 3 are the
1-based positions of the two repeat copies and whose fields 4 and 5 are
their lengths.

One line is written on the standard output:

    center dcenter out dout pos1 len1 pos2 len2 score

where center/out name the region ('LSC' or 'SSC') dominating between the two
copies and outside of them, dcenter/dout are the sign ('+' or '-') of the
corresponding signed sum, and positions are 1-based.
"""

import sys

REGIONS = ('LSC', 'SSC')


def read_coverage(path):
    """Build one per-position track for each region from the matches file.

    Every match adds its direction (+n or -1) to each position it covers in
    the track of its region. Both tracks always have the same length, the
    largest end position met so far. Lines with fewer than four fields are
    ignored.
    """
    chloro = dict((region, []) for region in REGIONS)

    with open(path) as data:
        for line in data:
            parts = line.strip().split()
            if len(parts) < 4:
                continue

            single = parts[0]
            begin = int(parts[1])
            end = int(parts[2])
            direction = int(parts[3])

            if direction == 0:
                direction = -1

            size = len(chloro['LSC'])
            if end > size:
                for track in chloro.values():
                    track.extend([0] * (end - size))

            track = chloro[single]
            for p in range(begin - 1, end):
                track[p] += direction

    return chloro


def normalize(track):
    """Scale a track by its largest absolute value.

    A track that is empty or made of zeros only (a region without any match)
    is returned as zeros instead of raising on max() or on the division.
    """
    top = float(max([abs(n) for n in track] or [0]))
    if top == 0:
        return [0.0] * len(track)
    return [n / top for n in track]


def _identity(value):
    return value


def region_sums(chloro, pos1, len1, pos2, len2, value=abs):
    """Sum each track between the two repeat copies and outside of them.

    pos1/pos2 are 0-based. The "center" window runs from the end of the
    first copy to the start of the second one; the "outer" window is what
    precedes the first copy plus what follows the second one. Every bound is
    clamped to the length of the LSC track. Returns, for each region, a
    (center, outer) tuple of the summed value(track[n]).
    """
    imax = len(chloro['LSC'])
    c_begin = min(pos1 + len1, imax)
    c_end = min(pos2, imax)
    o_max = min(pos1, imax)
    o_min = min(pos2 + len2, imax)

    sums = {}
    for region in REGIONS:
        track = chloro[region]
        center = sum(value(track[n]) for n in range(c_begin, c_end))
        outer = sum(value(track[n]) for n in range(0, o_max))
        outer += sum(value(track[n]) for n in range(o_min, len(track)))
        sums[region] = (center, outer)
    return sums


def best_repeat(path, chloro):
    """Return (score, pos1, len1, pos2, len2) of the best scoring repeat.

    The score of a repeat is the mean of the squared differences between the
    LSC and SSC shares of the absolute sums, in the center and in the outer
    windows. Positions are returned 0-based. Lines with fewer than five
    fields are ignored. None is returned when no repeat has a score > 0.
    """
    best = None
    score_max = 0

    with open(path) as repeats:
        for line in repeats:
            parts = line.strip().split()
            if len(parts) < 5:
                continue

            pos1 = int(parts[1]) - 1
            len1 = int(parts[3])
            pos2 = int(parts[2]) - 1
            len2 = int(parts[4])

            sums = region_sums(chloro, pos1, len1, pos2, len2, value=abs)
            c_lsc, o_lsc = sums['LSC']
            c_ssc, o_ssc = sums['SSC']

            c = float(c_lsc + c_ssc)
            o = float(o_lsc + o_ssc)

            if c > 0:
                c_lsc /= c
                c_ssc /= c

            if o > 0:
                o_lsc /= o
                o_ssc /= o

            score = ((c_lsc - c_ssc) ** 2 + (o_lsc - o_ssc) ** 2) / 2.0

            if score > score_max:
                score_max = score
                best = (pos1, len1, pos2, len2)

    if best is None:
        return None
    return (score_max,) + best


def dominant(lsc, ssc):
    """Name the region with the largest absolute sum, with the sum's sign.

    On a tie the SSC is reported.
    """
    if abs(lsc) > abs(ssc):
        return "LSC", ("+" if lsc > 0 else "-")
    return "SSC", ("+" if ssc > 0 else "-")


def main(argv, out=None):
    """Run the selection; argv follows sys.argv, out defaults to sys.stdout.

    Returns 0 on success, 1 when no repeat could be selected and 2 on a
    wrong command line.
    """
    if out is None:
        out = sys.stdout

    if len(argv) < 3:
        sys.stderr.write("Usage: %s <matches> <repeats>\n" % argv[0])
        return 2

    chloro = read_coverage(argv[1])
    for region in REGIONS:
        chloro[region] = normalize(chloro[region])

    best = best_repeat(argv[2], chloro)
    if best is None:
        sys.stderr.write("%s: no repeat with a positive score\n" % argv[0])
        return 1

    score_max, pos1, len1, pos2, len2 = best

    # The orientation of each window comes from the signed sums
    sums = region_sums(chloro, pos1, len1, pos2, len2, value=_identity)
    center, dcenter = dominant(sums['LSC'][0], sums['SSC'][0])
    outer, dout = dominant(sums['LSC'][1], sums['SSC'][1])

    out.write("%s %s %s %s %d %d %d %d %6.5f\n" % (center,
                                                   dcenter,
                                                   outer,
                                                   dout,
                                                   pos1 + 1,
                                                   len1,
                                                   pos2 + 1,
                                                   len2,
                                                   score_max))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
    
@ -62,10 +62,58 @@ function logwarning {

# Sequence related functions	
	
# Counts how many sequences are stored in a fasta file
# 	- $1 : The fasta file to count; standard input is read when omitted
# Prints the number of lines starting with '>' as a bare integer.
function fastaCount {
	# ${1:+"$1"} vanishes when no file is given, so awk falls back on stdin
	awk '/^>/ { count++ } END { print count + 0 }' ${1:+"$1"}
}


# compute the sequence length from a fasta sequence
# 	- $1 : The fasta file to measure
# The first line is taken as the title; the length of every following line
# is added up, newlines excluded. An empty file yields 0.
function seqlength {
	awk 'NR > 1 { total += length($0) } END { print total + 0 }' "$1"
}

# extract a subseq from a fasta sequence
# 	- $1 : The fasta file to cut
#   - $2 : First position of the subsequence (first position is numbered 1),
#   - $3 : End of the subsequence (included in the subsequence)
# Title lines are printed as they are met; the subsequence is printed at
# the end, 60 characters per line.
function cutseq {
	awk -v from="$2" -v end="$3" '
		function printfasta(seq,    i, seqlen) {
			seqlen = length(seq)
			for (i = 1; i <= seqlen; i += 60)
				print substr(seq, i, 60)
		}

		/^>/   { print $0 }
		! /^>/ { seq = seq $0 }
		END    { printfasta(substr(seq, from, end - from + 1)) }' "$1"
}

# Joins a set of sequences stored in a fasta file into
# a single sequence
# 	- $1 : The fasta file containing the sequences to join;
# 	       standard input is read when omitted
# The first line is kept whatever it is; every later title line is dropped
# before the remaining lines are handed over to formatfasta.
function joinfasta {
	# ${1:+"$1"} vanishes when no file is given, so awk falls back on stdin
	awk '(NR == 1) || ! /^>/ { print $0 }' ${1:+"$1"} |
		formatfasta
}

# Formats fasta data with 60 sequence characters per line
# 	- $1 : The fasta file to format; standard input is read when omitted
# Title lines are printed as they are met; the other lines are glued
# together and printed once the whole input has been read.
function formatfasta {
	# ${1:+"$1"} vanishes when no file is given, so awk falls back on stdin
	awk 'function printfasta(seq,    i, seqlen) {
	         seqlen = length(seq)
	         for (i = 1; i <= seqlen; i += 60)
	             print substr(seq, i, 60)
	     }
	     /^>/   { print $0 }
	     ! /^>/ { seq = seq $0 }
	     END    { printfasta(seq) }' ${1:+"$1"}
}


#
#
########################
@ -115,3 +163,4 @@ export PATH
export LANG=C
export LC_ALL=C