ADD: new option --fieldcutname to make ecoTag work with paired-end sequences (experimental)

This commit is contained in:
Céline Mercier
2012-10-11 14:20:04 +00:00
parent 4e33a4ab0d
commit ad9cf0653f

View File

@ -102,6 +102,12 @@ def addSearchOptions(optionManager):
default=0.0,
help='Tolarated rate of wrong assignation')
optionManager.add_option('-F','--fieldcutname',
action='store',dest='fieldcutname',
default=None,
help="""Turn Unassembled match mode 'ON' when 'fieldcutname' is a tag of the sequence.
This tag gives the position of the junction of the two reads in the sequence (on (Celine + Fred) parle trop bien anglais !)""")
def count(data):
rep = {}
@ -121,6 +127,34 @@ def count(data):
rep[k]=rep.get(k,default)+v
return rep
from obitools.location import SimpleLocation
def myLenlcs(s1, s2, fieldCutName, minid, normalized, reference):
if (fieldCutName is not None) and (s1.hasKey(fieldCutName)) :
cutPos = s1[fieldCutName]
locs1_5P = SimpleLocation(1,cutPos)
locs1_3P = SimpleLocation(cutPos+1,len(s1))
locs2_5P = SimpleLocation(1, min(cutPos, len(s2)))
locs2_3P = SimpleLocation(max(1, len(s2)-(len(s1)-cutPos)+1), len(s2))
overlap = min(0,len(s1) - len(s2))
lcs5P, lali5P = lenlcs(s1.getSubSeq(locs1_5P),s2.getSubSeq(locs2_5P),minid,False)
lcs3P, lali3P = lenlcs(s1.getSubSeq(locs1_3P),s2.getSubSeq(locs2_3P),minid,False)
raw_lcs = lcs5P + lcs3P - overlap
lali = lali5P + lali3P - overlap
lcs = raw_lcs / float(lali)
else:
lcs, lali = lenlcs(s1,s2,minid,normalized,reference)
return lcs, lali
def lcsIterator(entries,db,options):
for seq in entries:
@ -128,7 +162,7 @@ def lcsIterator(entries,db,options):
maxid = (None,0.0)
minid = options.minimum
for d in db:
lcs,lali = lenlcs(seq,d,minid,normalized=True,reference=ALILEN)
lcs,lali = myLenlcs(seq, d, options.fieldcutname, minid,normalized=True,reference=ALILEN)
if lcs > maxid[1]:
maxid = (d,lcs)
minid = maxid[1] ** options.shape
@ -145,7 +179,7 @@ def lcsIteratorSelf(entries,db,options):
maxid = ([],0.0)
minid = options.minimum
for d in db:
lcs,lali = lenlcs(seq,d,minid,normalized=True,reference=ALILEN)
lcs,lali = myLenlcs(seq,d,options.fieldcutname,minid,normalized=True,reference=ALILEN)
if lcs > maxid[1]:
maxid = ([d],lcs)
minid = maxid[1]