add the find cds script

Former-commit-id: d4d49b96f84c942a1f15d10c286988f7cea3b76f
Former-commit-id: 541a8dc171fa695a307b76df0446b33a7112bad1
This commit is contained in:
2015-05-07 16:42:15 +02:00
parent 7eb55b5d0b
commit d4ba831499

524
findcds Executable file
View File

@ -0,0 +1,524 @@
#!/usr/bin/env python
####################################################
# Date de derniere modif : 03/2004
# Il ne prend qu'une seule sequence fasta en entree.
# contact : coissac@abi.snv.jussieu.fr
####################################################
import sys
import re
import getopt
import sets
def progressBar(pos,max):
percent = float(pos)/max * 100
bar = '#' * int(percent/2)
bar+= '|/-\\-'[pos % 5]
bar+= ' ' * (50 - int(percent/2))
sys.stderr.write('\r%5.1f %% |%s]' %(percent,bar))
####################################################
# fonction de comparaison des longueurs de cds (4eme champs)
# pour les trier par ordre decroissant
####################################################
def complg (x,y) :
if x[3]< y[3] :
return 1
elif x[3] > y[3]:
return -1
else :
return 0
####################################################
# fonction de comparaison des longueurs de cds (4eme champs)
# pour les trier par ordre decroissant
####################################################
def ordercds (x,y) :
if x[0] > y[0] :
return 1
elif x[0] < y[0]:
return -1
else :
if x[1] > y[1] :
return 1
elif x[1] < y[1]:
return -1
return 0
####################################################
# recuperation des arguments
####################################################
def parse_args():
try:
opts = getopt.getopt(sys.argv[1:], 'vcil:N:I:T:F:d:')
except getopt.GetoptError:
print_usage(sys.argv[0])
sys.exit(2)
che=0 # valeurs par defaut
inc=0
lg=0
name=""
init = 'ATG'
stop = 'TAA|TAG|TGA'
filename = ""
delta=0
for elem in opts[0]: # modification des valeurs par defaut
if (elem[0]=="-v"):
print_version()
sys.exit(0)
if (elem[0]=="-c"):
che=1
elif (elem[0]=="-i"):
inc=1
elif (elem[0]=="-l"):
lg=int(elem[1])
elif (elem[0]=="-N"):
name=elem[1]
elif (elem[0]=="-I"):
init = elem[1]
elif (elem[0]=="-T"):
stop = elem[1]
elif (elem[0]=="-F"):
filename = elem[1]
elif (elem[0]=="-d"):
delta = int(elem[1])
if filename =="":
print_usage(sys.argv[0])
sys.exit()
init = sets.Set(init.upper().strip().split('|'))
stop = sets.Set(stop.upper().strip().split('|'))
return [che, inc, name, init, stop, lg, delta,filename]
####################################################
# HELP
####################################################
def print_usage(progname):
print "usage:",progname.split("/")[-1], "-[vcil:N:I:T:]F filename"
print "This argument gives the version of the program:"
print " -v"
print "This argument is needed for program execution:"
print " -F filename"
print " filename is the name of the fasta file"
print "These arguments are optional:"
print " -c"
print " if 2 or more CDS overlap, the optimum way passing by"
print " the longest ones are kept. This optimizes the total "
print " length of CDS."
print " This option also eliminates included ones "
print " -i"
print " only kept the maximum CDS (ignore the internal start"
print " codons). This option is unsuitable for the -c option."
print " -l min_length"
print " if the length of a CDS is less than min_length, it is"
print " eliminated"
print " -d delta"
print " if -c option is specified allow to indicate length"
print " of acceptable overlap between two genes"
print " -N prefix"
print " the name of each CDS found will begin with this prefix"
print " -I list of start codons separated by '|'"
print " this option allows to change the set of initiation codons if "
print " different from ATG. "
print " -T list of stop codons separated by '|'"
print " this option allows to change the set of termination codons if "
print " different from TAG+TGA+TAA."
return
####################################################
# Ecriture de la version
####################################################
def print_version ():
print "version 0.2 created in 03/2004"
return
####################################################
# Gestion des erreurs de fichiers:
####################################################
def file_open_warning(file):
print "Error opening", file, " for reading."
return
####################################################
# Gestion des erreurs de formats d'entree:
####################################################
def file_fasta_warning(file):
print "Error ", file, "not in fasta format"
print "or contains more than one sequence."
return
####################################################
# lecture du fichier et recuperation en string
####################################################
def read_fasta (file):
try:
infile=open(file, 'r')
except IOError:
file_open_warning(file)
sys.exit(0)
k=[]
name = ""
tmp=infile.readlines()
if tmp[0][0] != ">":
file_fasta_warning(file)
sys.exit(3)
name =tmp[0].split()[0][1:]
i=1
lfile = len(tmp)
for x in xrange (1, len(tmp)) :
if tmp[x][0]!=">":
k.append(tmp[x].strip().upper())
else:
file_fasta_warning(file)
sys.exit(3)
return name, "".join(k)
####################################################
# fonction qui ecrie le mot sur nb caracteres
####################################################
def print_espace(mot, nb):
taille = len(mot)
result = mot
for i in range(taille, nb):
result += " "
return result
####################################################
# fonction qui ecrie # et va a la ligne
####################################################
def print_diese(nb):
print "#\n" * nb,
####################################################
# fonction qui ecrie les resultats sur la sortie
# standard
####################################################
def print_out(n, s, args, codons, cdsW, cdsC, elim, cds):
nb_start = [0, 0]
nb_stop = [0, 0]
for i in codons.items():
if i[0] >= 0:
nb_start[0] += i[1][0]
nb_stop [0] += i[1][1]
else:
nb_start[1] += i[1][0]
nb_stop [1] += i[1][1]
# ecrit les caracteristiques de la sequence etudiee
print """\
#**** Recherche des CDS Version 0.0 ****
#
# Titre : %(title)s
# Longueur : %(seqlength)d
#
#""" % {'title' : n,
'seqlength' : len(s) }
if args[5]:
print "# Les CDS plus petits que %d sont elimines." % args[5]
if args[1]:
print "# Les CDS inclus dans d'autres CDS sont elimines."
if args[0]:
print "# Les CDS chevauchants d'autres CDS plus grands"
print "# sont elimines."
print """\
#
# Sequence : Brin Watson
# Complementaire: Brin Crick
#
# Analyse du brin Watson
#
# Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % {
'startcodon' : "(%s)" % ', '.join(args[3]),
'startcount' : nb_start[0]
}
for i in range (1, 4):
print "# Sur la phase %d : %5d" % (i,codons[i][0])
print """\
#
# Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % {
'stopcodon' : "(%s)" % ', '.join(args[4]),
'stopcount' : nb_stop[0]
}
for i in range (1, 4):
print "# Sur la phase %d : %5d" % (i,codons[i][1])
print """\
# %(cdsW)d localises sur le brin Watson
#
# Analyse du brin Crick
#
# Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % {
'cdsW' : cdsW,
'startcodon' : "(%s)" % ', '.join(args[3]),
'startcount' : nb_start[1]
}
for i in range (1, 4):
print "# Sur la phase %d : %5d" % (i,codons[-i][0])
print """\
#
# Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % {
'stopcodon' : "(%s)" % ', '.join(args[4]),
'stopcount' : nb_stop[1]
}
for i in range (1, 4):
print "# Sur la phase %d : %5d" % (i,codons[-i][1])
print """\
# %(cdsC)d localises sur le brin Watson
#""" % { 'cdsC' : cdsC }
# ecrit les bilans des filtres
if args[0]:
print "# %d CDS chevauchants ou inclus elimines" % elim
print_diese(1)
# ecrit les caracteristiques des CDS
print "# Brin Nom Debut Fin Commentaire"
print_diese(1)
num = 1
for i in cds:
strand = "Watson"
brin = "w"
if i[2] < 0:
strand = "Crick"
brin = "c"
mnemo = "%s_CDS%d%c" % (args[2],num,brin)
num += 1
# Brin Nom Debut Fin Commentaire
print "CDS %-10s %-15s %7d %7d # %s (phase %1d)" % (strand,mnemo,i[0]+1,i[1]+1,n,abs(i[2]))
# ecrit la sequence
#for i in xrange(0,len(s),60):
# print "SEQ %s" % s[i:i+60]
#print "XXX"
return
####################################################
# fonction qui cherche le complementaire (brin) d'une expression
# reguliere
####################################################
def compl (exp):
c = ""
for i in range (1, len (exp)+1):
a = exp [-i]
if a == "A":
c = c + "T"
elif a == "T":
c = c + "A"
elif a == "C":
c = c + "G"
elif a == "G":
c = c + "C"
elif a == "(":
c = c + ")"
elif a == ")":
c = c + "("
elif a == "]":
c = c + "["
elif a == "[":
c = c + "]"
else:
c = c + a
return c
####################################################
# fonction de recherche de tous les CDS en un seul passage sur les deux brins en
# meme temps
####################################################
def findAllCDS2 (s, init, stop, include,lg): # retourne le deb, la fin, la phase et la longueur des cds
nb ={1 : [0,0], 2:[0,0], 3: [0,0], -1 :[0,0], -2:[0,0], -3:[0,0]} # contient pour chaque phase possible :
sig_stack = {1 : [], 2:[], 3: [], -1 :[], -2:[], -3:[]}
cstop = sets.Set([compl(x) for x in stop])
cinit = sets.Set([compl(x) for x in init])
cds = []
cdsW = 0
cdsC = 0
lseq = len(s)
for pos in xrange(0,lseq):
if lseq > 100000 and not (pos % 599):
progressBar(pos,lseq)
codon = s[pos:pos+3]
i = pos + 2
phase = (pos % 3) + 1
rphase= -((i % 3) + 1)
if codon in init:
nb[phase][0]+=1
if ((include and not sig_stack[phase]) or
not include):
sig_stack[phase].append(pos)
if codon in stop:
nb[phase][1]+=1
while sig_stack[phase]:
start = sig_stack[phase].pop()
length=i - start + 1
if not lg or length >= lg:
cds.append((start,i,phase,i - start + 1))
cdsW+=1
if codon in cinit:
nb[rphase][0]+=1
if sig_stack[rphase]:
if include:
if len(sig_stack[rphase]) == 1:
sig_stack[rphase].append(i)
else:
sig_stack[rphase][1]=i
else:
sig_stack[rphase].append(i)
if codon in cstop:
nb[rphase][1]+=1
pstop = -1
sstack= []
if sig_stack[rphase]:
pstop = sig_stack[rphase][0]
sstack = sig_stack[rphase][1:]
while sstack:
start = sstack.pop()
length = start - pstop + 1
if not lg or length >= lg:
cds.append((pstop,start,rphase,start - pstop + 1))
cdsC+=1
sig_stack[rphase]=[pos]
# depile les ORF restant sur les
# piles reverses
for rphase in xrange(-3,0):
pstop = -1
sstack= []
if sig_stack[rphase]:
pstop = sig_stack[rphase][0]
sstack = sig_stack[rphase][1:]
while sstack:
start = sstack.pop()
length = start - pstop + 1
if not lg or length >= lg:
cds.append((pstop,start,rphase,start - pstop + 1))
cdsC+=1
cds.sort(ordercds)
return nb, cdsW, cdsC, cds
def __find_incompatible(automat,cds,state,first,start):
lcds = len(cds)
m = first
while (m<lcds) and (cds[m][0] < start):
m+=1
if m < lcds:
ref = cds[m]
while (m<lcds) and (cds[m][0] <= ref[1]):
if state not in automat[m]:
automat[m].add(state)
__find_incompatible(automat,cds,m,m,cds[m][1]+1)
m+=1
####################################################
# fonction de filtrage (inclus, chevauchants, petits)
####################################################
def new_filtre_cds(cds,delta=0):
cds.sort(ordercds)
automat = [sets.Set() for x in xrange(len(cds))]
lcds = len(cds)
for current in xrange(-1,lcds):
start=-1
if current >= 0:
start=cds[current][1]
m = current+1
while (m<lcds) and (start - cds[m][0] >= delta):
m+=1
if m < lcds:
ref = cds[m][1]
while (m<lcds) and (ref - cds[m][0] > delta):
automat[m].add(current)
m+=1
backtrack = []
winner = (0,)
for p in xrange(len(cds)):
scoremax = 0
best = -1
for vient_de in automat[p]:
score = 0
if (vient_de >= 0):
score = backtrack[vient_de][0]+(cds[vient_de][3]**2)
if score >= scoremax:
scoremax = score
best = vient_de
if scoremax > winner[0]:
winner=(scoremax,p)
backtrack.append((scoremax,best))
# print automat[0:10]
# print backtrack
good = []
current = winner[1]
while current >= 0:
good.append(cds[current])
current = backtrack[current][1]
good.reverse()
return len(cds)-len(good),good
####################################################
# Programme
####################################################
def main():
args=parse_args()
(che, inc, name, init, stop, lg, delta, filename) = args
name, seq = read_fasta(filename)
cds_fil = []
elim = [0,0]
nb_codons, cdsW, cdsC, cds = findAllCDS2(seq, init, stop, inc,lg)
if che or inc: # si tri sur chevauchants
elim, cds_fil = new_filtre_cds (cds,delta)
else :
cds_fil = cds [0:len(cds)]
#print len(cds), len(cds_fil)
#print cds_fil
print_out (name, seq, args, nb_codons, cdsW, cdsC, elim, cds_fil)
print >>sys.stderr,""
if __name__ == '__main__':
main()