2023-04-29 07:06:07 +02:00
|
|
|
#!/usr/bin/env python3
|
2015-10-08 15:38:27 -03:00
|
|
|
|
|
|
|
import sys
|
2016-10-05 08:01:30 -03:00
|
|
|
from math import lgamma
|
|
|
|
from math import log
|
2015-10-08 15:38:27 -03:00
|
|
|
|
|
|
|
# Command line: <script> MATCH_FILE REPEAT_FILE
# Fail with an explicit usage message rather than an IndexError traceback
# when an argument is missing.
if len(sys.argv) < 3:
    sys.exit("usage: %s <match_file> <repeat_file>" % sys.argv[0])

# Match file, iterated line by line by the first scanning loop.
data = open(sys.argv[1])

# Repeat file, iterated line by line by the scoring loop.
repeats = open(sys.argv[2])

# One vector of per-position counters for each single-copy label.
# Both vectors are grown together while the match file is read.
chloro = {'LSC': [], 'SSC': []}

# Current length of the vectors stored in chloro.
chlorosize = 0
|
|
|
|
|
2016-10-05 08:01:30 -03:00
|
|
|
def lpbinom(x, n, p):
    """Return the natural logarithm of the binomial probability P(X = x).

    The binomial coefficient is evaluated with ``lgamma`` instead of
    factorials, so the computation stays in log space.

    Parameters:
        x: number of successes.
        n: number of trials.
        p: success probability of a single trial.

    Returns:
        The log probability as a float.  For the degenerate probabilities
        ``p == 0`` and ``p == 1`` the result is ``0.0`` for the only
        possible outcome and ``-inf`` for every other ``x``.

    Raises:
        ValueError: if ``p`` lies outside [0, 1] (raised by ``log``).
    """
    # log(0) raises ValueError, so the two degenerate distributions are
    # handled explicitly: all the mass sits on x == 0 (p == 0) or on
    # x == n (p == 1).
    if p == 0:
        return 0.0 if x == 0 else float("-inf")
    if p == 1:
        return 0.0 if x == n else float("-inf")

    return (log(p) * x + log(1 - p) * (n - x)
            + lgamma(n + 1) - lgamma(x + 1) - lgamma(n - x + 1))
|
|
|
|
|
2015-11-09 17:03:22 +01:00
|
|
|
# We scan the blast matches:
#   for each single-copy label we build a vector with one position per
#   base pair, accumulating the strand code of every match covering it.
#
# The match file has the following format:
#   LSC/SSC  begin  end  same_strand=1/diff_strand=0

for line in data:
    parts = line.strip().split()

    # Lines with fewer than four fields carry no match and are ignored.
    if len(parts) < 4:
        continue

    single = parts[0]
    # The start is shifted by one so that range(begin, end) covers the
    # match (presumably 1-based inclusive coordinates in the file).
    begin = int(parts[1]) - 1
    end = int(parts[2])

    # Change the code of the direction:
    #   reverse complement (0 in the file) = -1
    direction = int(parts[3]) or -1

    # Both vectors are padded together so they always share one length.
    if end > chlorosize:
        padding = [0] * (end - chlorosize)
        chloro['LSC'].extend(padding)
        chloro['SSC'].extend(padding)
        chlorosize = len(chloro['LSC'])

    # Named 'track' so that the builtin chr() is not shadowed.
    track = chloro[single]

    for p in range(begin, end):
        track[p] += direction
|
|
|
|
|
2023-07-13 00:38:24 +02:00
|
|
|
# <Zafacs> 07/13/2023
# Hack for avoiding a crash when LSC and SSC have no blast similarity.
# Needs to be reworked.
#
# Each vector is rescaled so that the absolute values of its entries sum
# to 1.  The guard is on the sum rather than on the length: the two
# vectors are always padded together, so a vector can be non-empty and
# still contain only zeros (no match for that label), which would
# otherwise divide by zero.  Such a vector is left unchanged.
for region in ('SSC', 'LSC'):
    total = sum(abs(x) for x in chloro[region])
    if total > 0:
        chloro[region] = [n / total for n in chloro[region]]
|
2015-10-08 15:38:27 -03:00
|
|
|
|
|
|
|
def _track_sum(track, start, stop):
    """Sum the entries of ``track`` at positions start .. stop - 1."""
    return sum(track[n] for n in range(start, stop))


# Best candidate selected so far.  The positions stay None until a
# repeat pair has actually been selected.
scoreMax = 0
len1Max = 0
len2Max = 0
pos1Max = None
pos2Max = None

imax = len(chloro['LSC'])

for line in repeats:
    parts = line.strip().split()

    # Blank or truncated lines cannot describe a repeat pair; skip them
    # the same way the match-scanning loop skips short lines.
    if len(parts) < 5:
        continue

    # First repeat position and length
    #   (position start at 0)
    pos1 = int(parts[1]) - 1
    len1 = int(parts[3])

    # Second repeat position and length
    #   (position start at 0)
    pos2 = int(parts[2]) - 1
    len2 = int(parts[4])

    # Location of the central single copy
    #   - in between the two IR -
    c_begin = min(pos1 + len1, imax)
    c_end = min(pos2, imax)

    # Location of the external single copy
    #   - before the first IR and after the second one -
    o_max = min(pos1, imax)
    o_min = min(pos2 + len2, imax)

    # count of coherent matches for LSC and SSC on the central single copy
    c_lsc = abs(_track_sum(chloro['LSC'], c_begin, c_end))
    c_ssc = abs(_track_sum(chloro['SSC'], c_begin, c_end))

    # count of coherent matches for LSC and SSC on the external single copy
    # this score is in two parts: before the first copy and after the second
    o_lsc = abs(_track_sum(chloro['LSC'], 0, o_max)
                + _track_sum(chloro['LSC'], o_min, imax))
    o_ssc = abs(_track_sum(chloro['SSC'], 0, o_max)
                + _track_sum(chloro['SSC'], o_min, imax))

    # The four counts are already absolute values.  The score is large
    # when one label dominates the central region and the other label
    # dominates the external region.
    score = abs(c_lsc + o_ssc - o_lsc - c_ssc)

    # A pair replaces the current best only if it scores at least as well
    # AND at least one of its two copies is longer.
    if (score >= scoreMax) and ((len1 > len1Max) or (len2 > len2Max)):
        scoreMax = score
        pos1Max = pos1
        pos2Max = pos2
        len1Max = len1
        len2Max = len2

# Without a selected pair there is nothing to report; stop with an
# explicit message instead of failing later on an undefined name.
if pos1Max is None:
    sys.exit("error: no repeat pair could be selected from the repeat file")
|
|
|
|
|
|
|
|
# Boundaries of the single-copy regions delimited by the selected pair:
# the central region lies between the two copies, the external region is
# made of what precedes the first copy and what follows the second one.
central_start = min(pos1Max + len1Max, imax)
central_stop = min(pos2Max, imax)
before_stop = min(pos1Max, imax)
after_start = min(pos2Max + len2Max, imax)

# Signed sums (not absolute values) are kept here because their sign
# gives the reported orientation.
central = {}
external = {}
for label in ('LSC', 'SSC'):
    track = chloro[label]
    central[label] = sum(track[n] for n in range(central_start, central_stop))
    external[label] = sum(track[n] for n in range(0, before_stop))
    external[label] += sum(track[n] for n in range(after_start, imax))

# Each region is assigned the label with the strictly larger absolute
# sum (SSC on ties) and the sign of that label's sum.
center = "LSC" if abs(central['LSC']) > abs(central['SSC']) else "SSC"
dcenter = "+" if central[center] > 0 else "-"

out = "LSC" if abs(external['LSC']) > abs(external['SSC']) else "SSC"
dout = "+" if external[out] > 0 else "-"

# Positions are printed one higher than the internal values, which were
# decremented when the repeat file was read.
fields = (center, dcenter,
          out, dout,
          pos1Max + 1, len1Max,
          pos2Max + 1, len2Max,
          scoreMax)
sys.stdout.write("%s %s %s %s %d %d %d %d %6.5f\n" % fields)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#for p in range(chlorosize):
|
|
|
|
# sys.stdout.write("%d %d %d\n" % (p,chloro['SSC'][p],chloro['LSC'][p]))
|
2015-11-09 15:27:32 +01:00
|
|
|
|