#!/usr/bin/env python #################################################### # Date de derniere modif : 03/2004 # Il ne prend qu'une seule sequence fasta en entree. # contact : coissac@abi.snv.jussieu.fr #################################################### import sys import re import getopt import sets def progressBar(pos,max): percent = float(pos)/max * 100 bar = '#' * int(percent/2) bar+= '|/-\\-'[pos % 5] bar+= ' ' * (50 - int(percent/2)) sys.stderr.write('\r%5.1f %% |%s]' %(percent,bar)) #################################################### # fonction de comparaison des longueurs de cds (4eme champs) # pour les trier par ordre decroissant #################################################### def complg (x,y) : if x[3]< y[3] : return 1 elif x[3] > y[3]: return -1 else : return 0 #################################################### # fonction de comparaison des longueurs de cds (4eme champs) # pour les trier par ordre decroissant #################################################### def ordercds (x,y) : if x[0] > y[0] : return 1 elif x[0] < y[0]: return -1 else : if x[1] > y[1] : return 1 elif x[1] < y[1]: return -1 return 0 #################################################### # recuperation des arguments #################################################### def parse_args(): try: opts = getopt.getopt(sys.argv[1:], 'vcil:N:I:T:F:d:') except getopt.GetoptError: print_usage(sys.argv[0]) sys.exit(2) che=0 # valeurs par defaut inc=0 lg=0 name="" init = 'ATG' stop = 'TAA|TAG|TGA' filename = "" delta=0 for elem in opts[0]: # modification des valeurs par defaut if (elem[0]=="-v"): print_version() sys.exit(0) if (elem[0]=="-c"): che=1 elif (elem[0]=="-i"): inc=1 elif (elem[0]=="-l"): lg=int(elem[1]) elif (elem[0]=="-N"): name=elem[1] elif (elem[0]=="-I"): init = elem[1] elif (elem[0]=="-T"): stop = elem[1] elif (elem[0]=="-F"): filename = elem[1] elif (elem[0]=="-d"): delta = int(elem[1]) if filename =="": print_usage(sys.argv[0]) sys.exit() init = sets.Set(init.upper().strip().split('|')) stop = sets.Set(stop.upper().strip().split('|')) return [che, inc, name, init, stop, lg, delta,filename] #################################################### # HELP #################################################### def print_usage(progname): print "usage:",progname.split("/")[-1], "-[vcil:N:I:T:]F filename" print "This argument gives the version of the program:" print " -v" print "This argument is needed for program execution:" print " -F filename" print " filename is the name of the fasta file" print "These arguments are optional:" print " -c" print " if 2 or more CDS overlap, the optimum way passing by" print " the longest ones are kept. This optimizes the total " print " length of CDS." print " This option also eliminates included ones " print " -i" print " only kept the maximum CDS (ignore the internal start" print " codons). This option is unsuitable for the -c option." print " -l min_length" print " if the length of a CDS is less than min_length, it is" print " eliminated" print " -d delta" print " if -c option is specified allow to indicate length" print " of acceptable overlap between two genes" print " -N prefix" print " the name of each CDS found will begin with this prefix" print " -I list of start codons separated by '|'" print " this option allows to change the set of initiation codons if " print " different from ATG. " print " -T list of stop codons separated by '|'" print " this option allows to change the set of termination codons if " print " different from TAG+TGA+TAA." return #################################################### # Ecriture de la version #################################################### def print_version (): print "version 0.2 created in 03/2004" return #################################################### # Gestion des erreurs de fichiers: #################################################### def file_open_warning(file): print "Error opening", file, " for reading." return #################################################### # Gestion des erreurs de formats d'entree: #################################################### def file_fasta_warning(file): print "Error ", file, "not in fasta format" print "or contains more than one sequence." return #################################################### # lecture du fichier et recuperation en string #################################################### def read_fasta (file): try: infile=open(file, 'r') except IOError: file_open_warning(file) sys.exit(0) k=[] name = "" tmp=infile.readlines() if tmp[0][0] != ">": file_fasta_warning(file) sys.exit(3) name =tmp[0].split()[0][1:] i=1 lfile = len(tmp) for x in xrange (1, len(tmp)) : if tmp[x][0]!=">": k.append(tmp[x].strip().upper()) else: file_fasta_warning(file) sys.exit(3) return name, "".join(k) #################################################### # fonction qui ecrie le mot sur nb caracteres #################################################### def print_espace(mot, nb): taille = len(mot) result = mot for i in range(taille, nb): result += " " return result #################################################### # fonction qui ecrie # et va a la ligne #################################################### def print_diese(nb): print "#\n" * nb, #################################################### # fonction qui ecrie les resultats sur la sortie # standard #################################################### def print_out(n, s, args, codons, cdsW, cdsC, elim, cds): nb_start = [0, 0] nb_stop = [0, 0] for i in codons.items(): if i[0] >= 0: nb_start[0] += i[1][0] nb_stop [0] += i[1][1] else: nb_start[1] += i[1][0] nb_stop [1] += i[1][1] # ecrit les caracteristiques de la sequence etudiee print """\ #**** Recherche des CDS Version 0.0 **** # # Titre : %(title)s # Longueur : %(seqlength)d # #""" % {'title' : n, 'seqlength' : len(s) } if args[5]: print "# Les CDS plus petits que %d sont elimines." % args[5] if args[1]: print "# Les CDS inclus dans d'autres CDS sont elimines." if args[0]: print "# Les CDS chevauchants d'autres CDS plus grands" print "# sont elimines." print """\ # # Sequence : Brin Watson # Complementaire: Brin Crick # # Analyse du brin Watson # # Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % { 'startcodon' : "(%s)" % ', '.join(args[3]), 'startcount' : nb_start[0] } for i in range (1, 4): print "# Sur la phase %d : %5d" % (i,codons[i][0]) print """\ # # Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % { 'stopcodon' : "(%s)" % ', '.join(args[4]), 'stopcount' : nb_stop[0] } for i in range (1, 4): print "# Sur la phase %d : %5d" % (i,codons[i][1]) print """\ # %(cdsW)d localises sur le brin Watson # # Analyse du brin Crick # # Nombre de codons START %(startcodon)-20s : %(startcount)8d""" % { 'cdsW' : cdsW, 'startcodon' : "(%s)" % ', '.join(args[3]), 'startcount' : nb_start[1] } for i in range (1, 4): print "# Sur la phase %d : %5d" % (i,codons[-i][0]) print """\ # # Nombre de codons STOP %(stopcodon)-20s : %(stopcount)8d""" % { 'stopcodon' : "(%s)" % ', '.join(args[4]), 'stopcount' : nb_stop[1] } for i in range (1, 4): print "# Sur la phase %d : %5d" % (i,codons[-i][1]) print """\ # %(cdsC)d localises sur le brin Watson #""" % { 'cdsC' : cdsC } # ecrit les bilans des filtres if args[0]: print "# %d CDS chevauchants ou inclus elimines" % elim print_diese(1) # ecrit les caracteristiques des CDS print "# Brin Nom Debut Fin Commentaire" print_diese(1) num = 1 for i in cds: strand = "Watson" brin = "w" if i[2] < 0: strand = "Crick" brin = "c" mnemo = "%s_CDS%d%c" % (args[2],num,brin) num += 1 # Brin Nom Debut Fin Commentaire print "CDS %-10s %-15s %7d %7d # %s (phase %1d)" % (strand,mnemo,i[0]+1,i[1]+1,n,abs(i[2])) # ecrit la sequence #for i in xrange(0,len(s),60): # print "SEQ %s" % s[i:i+60] #print "XXX" return #################################################### # fonction qui cherche le complementaire (brin) d'une expression # reguliere #################################################### def compl (exp): c = "" for i in range (1, len (exp)+1): a = exp [-i] if a == "A": c = c + "T" elif a == "T": c = c + "A" elif a == "C": c = c + "G" elif a == "G": c = c + "C" elif a == "(": c = c + ")" elif a == ")": c = c + "(" elif a == "]": c = c + "[" elif a == "[": c = c + "]" else: c = c + a return c #################################################### # fonction de recherche de tous les CDS en un seul passage sur les deux brins en # meme temps #################################################### def findAllCDS2 (s, init, stop, include,lg): # retourne le deb, la fin, la phase et la longueur des cds nb ={1 : [0,0], 2:[0,0], 3: [0,0], -1 :[0,0], -2:[0,0], -3:[0,0]} # contient pour chaque phase possible : sig_stack = {1 : [], 2:[], 3: [], -1 :[], -2:[], -3:[]} cstop = sets.Set([compl(x) for x in stop]) cinit = sets.Set([compl(x) for x in init]) cds = [] cdsW = 0 cdsC = 0 lseq = len(s) for pos in xrange(0,lseq): if lseq > 100000 and not (pos % 599): progressBar(pos,lseq) codon = s[pos:pos+3] i = pos + 2 phase = (pos % 3) + 1 rphase= -((i % 3) + 1) if codon in init: nb[phase][0]+=1 if ((include and not sig_stack[phase]) or not include): sig_stack[phase].append(pos) if codon in stop: nb[phase][1]+=1 while sig_stack[phase]: start = sig_stack[phase].pop() length=i - start + 1 if not lg or length >= lg: cds.append((start,i,phase,i - start + 1)) cdsW+=1 if codon in cinit: nb[rphase][0]+=1 if sig_stack[rphase]: if include: if len(sig_stack[rphase]) == 1: sig_stack[rphase].append(i) else: sig_stack[rphase][1]=i else: sig_stack[rphase].append(i) if codon in cstop: nb[rphase][1]+=1 pstop = -1 sstack= [] if sig_stack[rphase]: pstop = sig_stack[rphase][0] sstack = sig_stack[rphase][1:] while sstack: start = sstack.pop() length = start - pstop + 1 if not lg or length >= lg: cds.append((pstop,start,rphase,start - pstop + 1)) cdsC+=1 sig_stack[rphase]=[pos] # depile les ORF restant sur les # piles reverses for rphase in xrange(-3,0): pstop = -1 sstack= [] if sig_stack[rphase]: pstop = sig_stack[rphase][0] sstack = sig_stack[rphase][1:] while sstack: start = sstack.pop() length = start - pstop + 1 if not lg or length >= lg: cds.append((pstop,start,rphase,start - pstop + 1)) cdsC+=1 cds.sort(ordercds) return nb, cdsW, cdsC, cds def __find_incompatible(automat,cds,state,first,start): lcds = len(cds) m = first while (m= 0: start=cds[current][1] m = current+1 while (m= delta): m+=1 if m < lcds: ref = cds[m][1] while (m delta): automat[m].add(current) m+=1 backtrack = [] winner = (0,) for p in xrange(len(cds)): scoremax = 0 best = -1 for vient_de in automat[p]: score = 0 if (vient_de >= 0): score = backtrack[vient_de][0]+(cds[vient_de][3]**2) if score >= scoremax: scoremax = score best = vient_de if scoremax > winner[0]: winner=(scoremax,p) backtrack.append((scoremax,best)) # print automat[0:10] # print backtrack good = [] current = winner[1] while current >= 0: good.append(cds[current]) current = backtrack[current][1] good.reverse() return len(cds)-len(good),good #################################################### # Programme #################################################### def main(): args=parse_args() (che, inc, name, init, stop, lg, delta, filename) = args name, seq = read_fasta(filename) cds_fil = [] elim = [0,0] nb_codons, cdsW, cdsC, cds = findAllCDS2(seq, init, stop, inc,lg) if che or inc: # si tri sur chevauchants elim, cds_fil = new_filtre_cds (cds,delta) else : cds_fil = cds [0:len(cds)] #print len(cds), len(cds_fil) #print cds_fil print_out (name, seq, args, nb_codons, cdsW, cdsC, elim, cds_fil) print >>sys.stderr,"" if __name__ == '__main__': main()