updated ecoFindTaxids to read UNITE databases.

This commit is contained in:
Céline Mercier
2012-10-16 12:52:47 +00:00
parent 4a9fbc367b
commit 82f743447c

View File

@ -33,6 +33,31 @@ def UNITEIterator(f):
yield s yield s
def SILVAIterator(f):
    """Iterate over the entries of a SILVA-style FASTA file.

    Each entry is split on newlines; the first line is the header and the
    second line is used as the sequence. The header is split on ' | ':
    the first field (without its leading '>') is the sequence id and the
    second field is the description holding the species name.

    For every entry a NucSequence is yielded with its 'species_name'
    attribute set. When the description contains a standalone '(' word,
    only the words before it are kept as the species name.

    Raises IndexError if an entry has no second line or if its header has
    no ' | ' separator.
    """
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')

    for entry in fastaEntryIterator(f):
        lines = entry.split('\n')
        fields = lines[0].split(' | ')
        seq_id = fields[0][1:]          # drop the leading '>'

        # NOTE(review): only the line right after the header is taken as
        # the sequence -- confirm SILVA input is never wrapped on several
        # lines.
        s = NucSequence(seq_id, lines[1])

        description = fields[1]
        words = description.split(' ')
        if '(' in words:
            # Keep what precedes the parenthesised part. Joining the words
            # avoids the leading space that word-by-word concatenation
            # produced, so names are formatted the same way whether or not
            # a parenthesis is present.
            # NOTE(review): a '(' glued to a word (e.g. '(strain') is not
            # treated as a cut point, as before -- confirm the header
            # format always uses ' ( '.
            species_name = ' '.join(words[:words.index('(')])
        else:
            species_name = description

        s['species_name'] = species_name
        yield s
def lookForSimilarSpeciesNameInGenus(species_name, species_list): def lookForSimilarSpeciesNameInGenus(species_name, species_list):
genus_species = species_name.split(' ') genus_species = species_name.split(' ')
@ -108,6 +133,7 @@ def getGenusTaxid(tax, species_name, ancestor):
def getTaxid(tax, name, ancestor): def getTaxid(tax, name, ancestor):
taxid = tax.findTaxonByName(name)[0] taxid = tax.findTaxonByName(name)[0]
#print '\n~~~~~~~in getTaxid: ', name, ', taxid :', taxid,', anc = ', tax.isAncestor(ancestor, taxid), '\n'
if ancestor != None and not tax.isAncestor(ancestor, taxid) : if ancestor != None and not tax.isAncestor(ancestor, taxid) :
raise KeyError() raise KeyError()
@ -168,7 +194,7 @@ def addTaxonomyOptions(optionManager):
metavar="<FILENAME>", metavar="<FILENAME>",
type="string", type="string",
default=None, default=None,
help="file used to store sequences with the genus found.") help="(not with UNITE databases) file used to store sequences with the genus found.")
optionManager.add_option('-u','--unidentified', optionManager.add_option('-u','--unidentified',
action="store", dest="unidentified", action="store", dest="unidentified",
@ -182,7 +208,7 @@ def addTaxonomyOptions(optionManager):
metavar="<dirty_file>", metavar="<dirty_file>",
type="str", type="str",
default=None, default=None,
help="if chosen, ALL the words in the name used to identify the sequences will be searched" help="(not with UNITE databases) if chosen, ALL the words in the name used to identify the sequences will be searched"
" when neither the exact name nor the genus have been found." " when neither the exact name nor the genus have been found."
" Only use if the sequences in your database are badly named with useless words or numbers" " Only use if the sequences in your database are badly named with useless words or numbers"
" in the name etc." " in the name etc."
@ -193,7 +219,7 @@ def addTaxonomyOptions(optionManager):
metavar="<dbtype>", metavar="<dbtype>",
type="string", type="string",
default='raw', default='raw',
help="type of the database with the taxa to be added. Possibilities : 'raw' or 'UNITE'." help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
" Default : raw.") " Default : raw.")
optionManager.add_option('-T','--tag-name', optionManager.add_option('-T','--tag-name',
@ -234,18 +260,20 @@ if __name__=='__main__':
(options,entries) = optionParser() (options,entries) = optionParser()
tax=loadTaxonomyDatabase(options)
if options.db_type == 'raw' : if options.db_type == 'raw' :
entryIterator=fastaIterator entryIterator=fastaIterator
elif options.db_type == 'UNITE' : elif options.db_type == 'UNITE' :
entryIterator=UNITEIterator entryIterator=UNITEIterator
elif options.db_type == 'SILVA' :
entryIterator=SILVAIterator
options.tagname = 'species_name'
entries = entryIterator(entries) entries = entryIterator(entries)
tax=loadTaxonomyDatabase(options)
openFiles(options) openFiles(options)
if options.db_type == 'raw' : if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
if options.res_anc == '' : if options.res_anc == '' :
restricting_ancestor = None restricting_ancestor = None
@ -287,59 +315,67 @@ if __name__=='__main__':
elif options.db_type == 'UNITE' : elif options.db_type == 'UNITE' :
raise NotImplemented restricting_ancestor = tax.findTaxonByName('Fungi')[0]
# for s in entries : for s in entries :
#
# try: try:
# species_name = s["UNITE_species_name"] species_name = s["UNITE_species_name"]
# taxid = getTaxid(tax, species_name) taxid = getTaxid(tax, species_name, restricting_ancestor)
# s['taxid']=taxid s['taxid']=taxid
# s["species_name"] = species_name s["species_name"] = species_name
# print formatFasta(s) print formatFasta(s)
#
# except KeyError: except KeyError:
# try: try:
# species_name = s["ISDN_species_name"] species_name = s["ISDN_species_name"]
# taxid = getTaxid(tax, species_name) print species_name
# s['taxid']=taxid taxid = getTaxid(tax, species_name, restricting_ancestor)
# s["species_name"] = species_name s['taxid']=taxid
# print formatFasta(s) s["species_name"] = species_name
# print formatFasta(s)
# except KeyError:
# except KeyError:
# if s["UNITE_species_name"] != "-" :
# species_name = s["UNITE_species_name"] if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" :
# s["species_name"] = s["UNITE_species_name"]
# s["species_name"] = species_name chosen = 'unite'
#
# if options.genusdefined is not None: elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" :
# s["species_name"] = s["ISDN_species_name"]
# try: chosen = 'isdn'
# genusTaxid = getGenusTaxid(species_name)
# else :
# if options.species_list == True or options.similar_names_list == True : if s["UNITE_path"] != "-" and s["UNITE_path"] != "" :
# speciesList = [i for i in tax.subTreeIterator(genusTaxid)] chosen = 'unite'
# s["species_name"] = (s["UNITE_path"].split(', '))[-1]
# if options.species_list == True :
# s['species in this genus'] = getAllSpeciesFromGenus(speciesList) elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
# chosen = 'isdn'
# if options.similar_names_list == True : s["species_name"] = (s["ISDN_path"].split(', '))[-1]
# similar_names = lookForSimilarSpeciesNameInGenus(species_name, speciesList)
# if similar_names != None : else :
# s['similar_names'] = similar_names print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
#
# print>> options.genusdefined,formatFasta(s) if chosen == 'unite' :
# s['path'] = s["UNITE_path"]
# except KeyError: else :
# s['path'] = s["ISDN_path"]
# if options.unidentified is not None:
# if options.unidentified is not None :
# if options.similar_names_list == True : print>>options.unidentified,formatFasta(s)
# similar_names = lookForSimilarNamesInTaxonomy(species_name, tax._taxonomy)
# if similar_names != None :
# s['similar_names'] = similar_names
#
# print>> options.unidentified,formatFasta(s)