add the find cds script
Former-commit-id: d4d49b96f84c942a1f15d10c286988f7cea3b76f Former-commit-id: 541a8dc171fa695a307b76df0446b33a7112bad1
This commit is contained in:
524
findcds
Executable file
524
findcds
Executable file
@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
####################################################
|
||||
# Date de derniere modif : 03/2004
|
||||
# Il ne prend qu'une seule sequence fasta en entree.
|
||||
# contact : coissac@abi.snv.jussieu.fr
|
||||
####################################################
|
||||
|
||||
import sys
|
||||
import re
|
||||
import getopt
|
||||
import sets
|
||||
|
||||
def progressBar(pos,max):
|
||||
percent = float(pos)/max * 100
|
||||
bar = '#' * int(percent/2)
|
||||
bar+= '|/-\\-'[pos % 5]
|
||||
bar+= ' ' * (50 - int(percent/2))
|
||||
sys.stderr.write('\r%5.1f %% |%s]' %(percent,bar))
|
||||
|
||||
####################################################
|
||||
# fonction de comparaison des longueurs de cds (4eme champs)
|
||||
# pour les trier par ordre decroissant
|
||||
####################################################
|
||||
def complg (x,y) :
|
||||
|
||||
if x[3]< y[3] :
|
||||
return 1
|
||||
elif x[3] > y[3]:
|
||||
return -1
|
||||
else :
|
||||
return 0
|
||||
|
||||
####################################################
|
||||
# fonction de comparaison des longueurs de cds (4eme champs)
|
||||
# pour les trier par ordre decroissant
|
||||
####################################################
|
||||
def ordercds (x,y) :
|
||||
|
||||
if x[0] > y[0] :
|
||||
return 1
|
||||
elif x[0] < y[0]:
|
||||
return -1
|
||||
else :
|
||||
if x[1] > y[1] :
|
||||
return 1
|
||||
elif x[1] < y[1]:
|
||||
return -1
|
||||
return 0
|
||||
|
||||
####################################################
|
||||
# recuperation des arguments
|
||||
####################################################
|
||||
def parse_args():
|
||||
|
||||
try:
|
||||
opts = getopt.getopt(sys.argv[1:], 'vcil:N:I:T:F:d:')
|
||||
except getopt.GetoptError:
|
||||
print_usage(sys.argv[0])
|
||||
sys.exit(2)
|
||||
|
||||
che=0 # valeurs par defaut
|
||||
inc=0
|
||||
lg=0
|
||||
name=""
|
||||
init = 'ATG'
|
||||
stop = 'TAA|TAG|TGA'
|
||||
filename = ""
|
||||
delta=0
|
||||
|
||||
for elem in opts[0]: # modification des valeurs par defaut
|
||||
if (elem[0]=="-v"):
|
||||
print_version()
|
||||
sys.exit(0)
|
||||
if (elem[0]=="-c"):
|
||||
che=1
|
||||
elif (elem[0]=="-i"):
|
||||
inc=1
|
||||
elif (elem[0]=="-l"):
|
||||
lg=int(elem[1])
|
||||
elif (elem[0]=="-N"):
|
||||
name=elem[1]
|
||||
elif (elem[0]=="-I"):
|
||||
init = elem[1]
|
||||
elif (elem[0]=="-T"):
|
||||
stop = elem[1]
|
||||
elif (elem[0]=="-F"):
|
||||
filename = elem[1]
|
||||
elif (elem[0]=="-d"):
|
||||
delta = int(elem[1])
|
||||
|
||||
if filename =="":
|
||||
print_usage(sys.argv[0])
|
||||
sys.exit()
|
||||
|
||||
init = sets.Set(init.upper().strip().split('|'))
|
||||
stop = sets.Set(stop.upper().strip().split('|'))
|
||||
|
||||
return [che, inc, name, init, stop, lg, delta,filename]
|
||||
|
||||
|
||||
####################################################
|
||||
# HELP
|
||||
####################################################
|
||||
def print_usage(progname):
|
||||
print "usage:",progname.split("/")[-1], "-[vcil:N:I:T:]F filename"
|
||||
print "This argument gives the version of the program:"
|
||||
print " -v"
|
||||
print "This argument is needed for program execution:"
|
||||
print " -F filename"
|
||||
print " filename is the name of the fasta file"
|
||||
print "These arguments are optional:"
|
||||
print " -c"
|
||||
print " if 2 or more CDS overlap, the optimum way passing by"
|
||||
print " the longest ones are kept. This optimizes the total "
|
||||
print " length of CDS."
|
||||
print " This option also eliminates included ones "
|
||||
print " -i"
|
||||
print " only kept the maximum CDS (ignore the internal start"
|
||||
print " codons). This option is unsuitable for the -c option."
|
||||
print " -l min_length"
|
||||
print " if the length of a CDS is less than min_length, it is"
|
||||
print " eliminated"
|
||||
print " -d delta"
|
||||
print " if -c option is specified allow to indicate length"
|
||||
print " of acceptable overlap between two genes"
|
||||
print " -N prefix"
|
||||
print " the name of each CDS found will begin with this prefix"
|
||||
print " -I list of start codons separated by '|'"
|
||||
print " this option allows to change the set of initiation codons if "
|
||||
print " different from ATG. "
|
||||
print " -T list of stop codons separated by '|'"
|
||||
print " this option allows to change the set of termination codons if "
|
||||
print " different from TAG+TGA+TAA."
|
||||
return
|
||||
|
||||
|
||||
####################################################
|
||||
# Ecriture de la version
|
||||
####################################################
|
||||
|
||||
def print_version ():
|
||||
print "version 0.2 created in 03/2004"
|
||||
return
|
||||
|
||||
####################################################
|
||||
# Gestion des erreurs de fichiers:
|
||||
####################################################
|
||||
|
||||
def file_open_warning(file):
|
||||
print "Error opening", file, " for reading."
|
||||
return
|
||||
|
||||
|
||||
####################################################
|
||||
# Gestion des erreurs de formats d'entree:
|
||||
####################################################
|
||||
|
||||
def file_fasta_warning(file):
|
||||
print "Error ", file, "not in fasta format"
|
||||
print "or contains more than one sequence."
|
||||
return
|
||||
|
||||
####################################################
|
||||
# lecture du fichier et recuperation en string
|
||||
####################################################
|
||||
|
||||
def read_fasta (file):
|
||||
try:
|
||||
infile=open(file, 'r')
|
||||
except IOError:
|
||||
file_open_warning(file)
|
||||
sys.exit(0)
|
||||
k=[]
|
||||
name = ""
|
||||
tmp=infile.readlines()
|
||||
if tmp[0][0] != ">":
|
||||
file_fasta_warning(file)
|
||||
sys.exit(3)
|
||||
name =tmp[0].split()[0][1:]
|
||||
i=1
|
||||
lfile = len(tmp)
|
||||
for x in xrange (1, len(tmp)) :
|
||||
if tmp[x][0]!=">":
|
||||
k.append(tmp[x].strip().upper())
|
||||
else:
|
||||
file_fasta_warning(file)
|
||||
sys.exit(3)
|
||||
return name, "".join(k)
|
||||
|
||||
|
||||
####################################################
|
||||
# fonction qui ecrie le mot sur nb caracteres
|
||||
####################################################
|
||||
def print_espace(mot, nb):
|
||||
taille = len(mot)
|
||||
result = mot
|
||||
for i in range(taille, nb):
|
||||
result += " "
|
||||
return result
|
||||
|
||||
####################################################
|
||||
# fonction qui ecrie # et va a la ligne
|
||||
####################################################
|
||||
def print_diese(nb):
|
||||
print "#\n" * nb,
|
||||
|
||||
####################################################
|
||||
# fonction qui ecrie les resultats sur la sortie
|
||||
# standard
|
||||
####################################################
|
||||
|
||||
def print_out(n, s, args, codons, cdsW, cdsC, elim, cds):
|
||||
nb_start = [0, 0]
|
||||
nb_stop = [0, 0]
|
||||
for i in codons.items():
|
||||
if i[0] >= 0:
|
||||
nb_start[0] += i[1][0]
|
||||
nb_stop [0] += i[1][1]
|
||||
else:
|
||||
nb_start[1] += i[1][0]
|
||||
nb_stop [1] += i[1][1]
|
||||
|
||||
# ecrit les caracteristiques de la sequence etudiee
|
||||
print """\
|
||||
#**** Recherche des CDS Version 0.0 ****
|
||||
#
|
||||
# Titre : %(title)s
|
||||
# Longueur : %(seqlength)d
|
||||
#
|
||||
#""" % {'title' : n,
|
||||
'seqlength' : len(s) }
|
||||
|
||||
if args[5]:
|
||||
print "# Les CDS plus petits que %d sont elimines." % args[5]
|
||||
|
||||
if args[1]:
|
||||
print "# Les CDS inclus dans d'autres CDS sont elimines."
|
||||
|
||||
if args[0]:
|
||||
print "# Les CDS chevauchants d'autres CDS plus grands"
|
||||
print "# sont elimines."
|
||||
|
||||
print """\
|
||||
#
|
||||
# Sequence : Brin Watson
|
||||
# Complementaire: Brin Crick
|
||||
#
|
||||
# Analyse du brin Watson
|
||||
#
|
||||
# Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % {
|
||||
'startcodon' : "(%s)" % ', '.join(args[3]),
|
||||
'startcount' : nb_start[0]
|
||||
}
|
||||
|
||||
for i in range (1, 4):
|
||||
print "# Sur la phase %d : %5d" % (i,codons[i][0])
|
||||
|
||||
print """\
|
||||
#
|
||||
# Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % {
|
||||
'stopcodon' : "(%s)" % ', '.join(args[4]),
|
||||
'stopcount' : nb_stop[0]
|
||||
}
|
||||
|
||||
for i in range (1, 4):
|
||||
print "# Sur la phase %d : %5d" % (i,codons[i][1])
|
||||
|
||||
print """\
|
||||
# %(cdsW)d localises sur le brin Watson
|
||||
#
|
||||
# Analyse du brin Crick
|
||||
#
|
||||
# Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % {
|
||||
'cdsW' : cdsW,
|
||||
'startcodon' : "(%s)" % ', '.join(args[3]),
|
||||
'startcount' : nb_start[1]
|
||||
}
|
||||
|
||||
for i in range (1, 4):
|
||||
print "# Sur la phase %d : %5d" % (i,codons[-i][0])
|
||||
|
||||
print """\
|
||||
#
|
||||
# Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % {
|
||||
'stopcodon' : "(%s)" % ', '.join(args[4]),
|
||||
'stopcount' : nb_stop[1]
|
||||
}
|
||||
|
||||
for i in range (1, 4):
|
||||
print "# Sur la phase %d : %5d" % (i,codons[-i][1])
|
||||
print """\
|
||||
# %(cdsC)d localises sur le brin Watson
|
||||
#""" % { 'cdsC' : cdsC }
|
||||
|
||||
|
||||
# ecrit les bilans des filtres
|
||||
if args[0]:
|
||||
print "# %d CDS chevauchants ou inclus elimines" % elim
|
||||
print_diese(1)
|
||||
|
||||
# ecrit les caracteristiques des CDS
|
||||
print "# Brin Nom Debut Fin Commentaire"
|
||||
print_diese(1)
|
||||
num = 1
|
||||
for i in cds:
|
||||
strand = "Watson"
|
||||
brin = "w"
|
||||
if i[2] < 0:
|
||||
strand = "Crick"
|
||||
brin = "c"
|
||||
mnemo = "%s_CDS%d%c" % (args[2],num,brin)
|
||||
num += 1
|
||||
# Brin Nom Debut Fin Commentaire
|
||||
print "CDS %-10s %-15s %7d %7d # %s (phase %1d)" % (strand,mnemo,i[0]+1,i[1]+1,n,abs(i[2]))
|
||||
# ecrit la sequence
|
||||
|
||||
#for i in xrange(0,len(s),60):
|
||||
# print "SEQ %s" % s[i:i+60]
|
||||
|
||||
#print "XXX"
|
||||
return
|
||||
|
||||
####################################################
|
||||
# fonction qui cherche le complementaire (brin) d'une expression
|
||||
# reguliere
|
||||
####################################################
|
||||
def compl (exp):
|
||||
c = ""
|
||||
for i in range (1, len (exp)+1):
|
||||
a = exp [-i]
|
||||
if a == "A":
|
||||
c = c + "T"
|
||||
elif a == "T":
|
||||
c = c + "A"
|
||||
elif a == "C":
|
||||
c = c + "G"
|
||||
elif a == "G":
|
||||
c = c + "C"
|
||||
elif a == "(":
|
||||
c = c + ")"
|
||||
elif a == ")":
|
||||
c = c + "("
|
||||
elif a == "]":
|
||||
c = c + "["
|
||||
elif a == "[":
|
||||
c = c + "]"
|
||||
else:
|
||||
c = c + a
|
||||
return c
|
||||
|
||||
|
||||
####################################################
|
||||
# fonction de recherche de tous les CDS en un seul passage sur les deux brins en
|
||||
# meme temps
|
||||
####################################################
|
||||
def findAllCDS2 (s, init, stop, include,lg): # retourne le deb, la fin, la phase et la longueur des cds
|
||||
nb ={1 : [0,0], 2:[0,0], 3: [0,0], -1 :[0,0], -2:[0,0], -3:[0,0]} # contient pour chaque phase possible :
|
||||
sig_stack = {1 : [], 2:[], 3: [], -1 :[], -2:[], -3:[]}
|
||||
cstop = sets.Set([compl(x) for x in stop])
|
||||
cinit = sets.Set([compl(x) for x in init])
|
||||
cds = []
|
||||
cdsW = 0
|
||||
cdsC = 0
|
||||
|
||||
lseq = len(s)
|
||||
|
||||
for pos in xrange(0,lseq):
|
||||
if lseq > 100000 and not (pos % 599):
|
||||
progressBar(pos,lseq)
|
||||
|
||||
codon = s[pos:pos+3]
|
||||
i = pos + 2
|
||||
phase = (pos % 3) + 1
|
||||
rphase= -((i % 3) + 1)
|
||||
if codon in init:
|
||||
nb[phase][0]+=1
|
||||
if ((include and not sig_stack[phase]) or
|
||||
not include):
|
||||
sig_stack[phase].append(pos)
|
||||
|
||||
if codon in stop:
|
||||
nb[phase][1]+=1
|
||||
while sig_stack[phase]:
|
||||
start = sig_stack[phase].pop()
|
||||
length=i - start + 1
|
||||
if not lg or length >= lg:
|
||||
cds.append((start,i,phase,i - start + 1))
|
||||
cdsW+=1
|
||||
|
||||
if codon in cinit:
|
||||
nb[rphase][0]+=1
|
||||
if sig_stack[rphase]:
|
||||
if include:
|
||||
if len(sig_stack[rphase]) == 1:
|
||||
sig_stack[rphase].append(i)
|
||||
else:
|
||||
sig_stack[rphase][1]=i
|
||||
else:
|
||||
sig_stack[rphase].append(i)
|
||||
|
||||
|
||||
if codon in cstop:
|
||||
nb[rphase][1]+=1
|
||||
pstop = -1
|
||||
sstack= []
|
||||
if sig_stack[rphase]:
|
||||
pstop = sig_stack[rphase][0]
|
||||
sstack = sig_stack[rphase][1:]
|
||||
while sstack:
|
||||
start = sstack.pop()
|
||||
length = start - pstop + 1
|
||||
if not lg or length >= lg:
|
||||
cds.append((pstop,start,rphase,start - pstop + 1))
|
||||
cdsC+=1
|
||||
sig_stack[rphase]=[pos]
|
||||
|
||||
# depile les ORF restant sur les
|
||||
# piles reverses
|
||||
for rphase in xrange(-3,0):
|
||||
pstop = -1
|
||||
sstack= []
|
||||
if sig_stack[rphase]:
|
||||
pstop = sig_stack[rphase][0]
|
||||
sstack = sig_stack[rphase][1:]
|
||||
while sstack:
|
||||
start = sstack.pop()
|
||||
length = start - pstop + 1
|
||||
if not lg or length >= lg:
|
||||
cds.append((pstop,start,rphase,start - pstop + 1))
|
||||
cdsC+=1
|
||||
|
||||
cds.sort(ordercds)
|
||||
return nb, cdsW, cdsC, cds
|
||||
|
||||
|
||||
def __find_incompatible(automat,cds,state,first,start):
|
||||
lcds = len(cds)
|
||||
m = first
|
||||
while (m<lcds) and (cds[m][0] < start):
|
||||
m+=1
|
||||
|
||||
if m < lcds:
|
||||
ref = cds[m]
|
||||
while (m<lcds) and (cds[m][0] <= ref[1]):
|
||||
if state not in automat[m]:
|
||||
automat[m].add(state)
|
||||
__find_incompatible(automat,cds,m,m,cds[m][1]+1)
|
||||
m+=1
|
||||
|
||||
####################################################
|
||||
# fonction de filtrage (inclus, chevauchants, petits)
|
||||
####################################################
|
||||
|
||||
def new_filtre_cds(cds,delta=0):
|
||||
cds.sort(ordercds)
|
||||
automat = [sets.Set() for x in xrange(len(cds))]
|
||||
lcds = len(cds)
|
||||
|
||||
for current in xrange(-1,lcds):
|
||||
start=-1
|
||||
if current >= 0:
|
||||
start=cds[current][1]
|
||||
m = current+1
|
||||
while (m<lcds) and (start - cds[m][0] >= delta):
|
||||
m+=1
|
||||
if m < lcds:
|
||||
ref = cds[m][1]
|
||||
while (m<lcds) and (ref - cds[m][0] > delta):
|
||||
automat[m].add(current)
|
||||
m+=1
|
||||
|
||||
backtrack = []
|
||||
winner = (0,)
|
||||
|
||||
for p in xrange(len(cds)):
|
||||
scoremax = 0
|
||||
best = -1
|
||||
for vient_de in automat[p]:
|
||||
score = 0
|
||||
if (vient_de >= 0):
|
||||
score = backtrack[vient_de][0]+(cds[vient_de][3]**2)
|
||||
if score >= scoremax:
|
||||
scoremax = score
|
||||
best = vient_de
|
||||
if scoremax > winner[0]:
|
||||
winner=(scoremax,p)
|
||||
backtrack.append((scoremax,best))
|
||||
|
||||
# print automat[0:10]
|
||||
# print backtrack
|
||||
good = []
|
||||
current = winner[1]
|
||||
|
||||
while current >= 0:
|
||||
good.append(cds[current])
|
||||
current = backtrack[current][1]
|
||||
|
||||
good.reverse()
|
||||
return len(cds)-len(good),good
|
||||
|
||||
|
||||
####################################################
|
||||
# Programme
|
||||
####################################################
|
||||
def main():
|
||||
args=parse_args()
|
||||
(che, inc, name, init, stop, lg, delta, filename) = args
|
||||
name, seq = read_fasta(filename)
|
||||
cds_fil = []
|
||||
elim = [0,0]
|
||||
nb_codons, cdsW, cdsC, cds = findAllCDS2(seq, init, stop, inc,lg)
|
||||
if che or inc: # si tri sur chevauchants
|
||||
elim, cds_fil = new_filtre_cds (cds,delta)
|
||||
else :
|
||||
cds_fil = cds [0:len(cds)]
|
||||
#print len(cds), len(cds_fil)
|
||||
#print cds_fil
|
||||
print_out (name, seq, args, nb_codons, cdsW, cdsC, elim, cds_fil)
|
||||
print >>sys.stderr,""
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
Reference in New Issue
Block a user