updated ecoFindTaxids to read UNITE databases.
This commit is contained in:
@ -33,6 +33,31 @@ def UNITEIterator(f):
|
|||||||
yield s
|
yield s
|
||||||
|
|
||||||
|
|
||||||
|
def SILVAIterator(f):
    """Iterate over the records of a SILVA-style FASTA stream.

    Each record is expected to look like::

        >SEQUENCE_ID | species description (optional annotation)
        ACGT...

    For every record a ``NucSequence`` is built from the identifier and
    the sequence, and the species name extracted from the header is
    stored under its ``'species_name'`` key.

    The species name is the second ``' | '``-separated header field,
    cut at the first ``'('`` (if any) and stripped of surrounding
    whitespace, so ``'Genus species (strain X)'`` gives
    ``'Genus species'``.

    @param f: the input handed to the entry iterator built by
              ``genericEntryIteratorGenerator(startEntry='>')``
              (presumably a file-like object or an iterable of
              lines -- confirm against that helper).
    @return: a generator of ``NucSequence`` objects.
    @raise IndexError: if a header has no ``' | '`` separator, since
                       the description field is then missing.
    """
    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')

    for entry in fastaEntryIterator(f):
        lines = entry.split('\n')
        fields = lines[0].split(' | ')

        # Drop the leading '>' of the FASTA header to get the identifier.
        seq_id = fields[0][1:]

        # Concatenate every sequence line, not just the first one, so
        # that wrapped (multi-line) FASTA records are not truncated.
        seq = ''.join(line.strip() for line in lines[1:])

        s = NucSequence(seq_id, seq)

        # Keep only what precedes the first '('.  Testing whole words
        # against '(' misses the usual case where the parenthesis is
        # glued to the following word, e.g. '(strain', and building the
        # name word by word left a spurious leading space.
        s['species_name'] = fields[1].split('(', 1)[0].strip()

        yield s
|
||||||
|
|
||||||
|
|
||||||
def lookForSimilarSpeciesNameInGenus(species_name, species_list):
|
def lookForSimilarSpeciesNameInGenus(species_name, species_list):
|
||||||
|
|
||||||
genus_species = species_name.split(' ')
|
genus_species = species_name.split(' ')
|
||||||
@ -108,6 +133,7 @@ def getGenusTaxid(tax, species_name, ancestor):
|
|||||||
def getTaxid(tax, name, ancestor):
|
def getTaxid(tax, name, ancestor):
|
||||||
|
|
||||||
taxid = tax.findTaxonByName(name)[0]
|
taxid = tax.findTaxonByName(name)[0]
|
||||||
|
#print '\n~~~~~~~in getTaxid: ', name, ', taxid :', taxid,', anc = ', tax.isAncestor(ancestor, taxid), '\n'
|
||||||
if ancestor != None and not tax.isAncestor(ancestor, taxid) :
|
if ancestor != None and not tax.isAncestor(ancestor, taxid) :
|
||||||
raise KeyError()
|
raise KeyError()
|
||||||
|
|
||||||
@ -168,7 +194,7 @@ def addTaxonomyOptions(optionManager):
|
|||||||
metavar="<FILENAME>",
|
metavar="<FILENAME>",
|
||||||
type="string",
|
type="string",
|
||||||
default=None,
|
default=None,
|
||||||
help="file used to store sequences with the genus found.")
|
help="(not with UNITE databases) file used to store sequences with the genus found.")
|
||||||
|
|
||||||
optionManager.add_option('-u','--unidentified',
|
optionManager.add_option('-u','--unidentified',
|
||||||
action="store", dest="unidentified",
|
action="store", dest="unidentified",
|
||||||
@ -182,7 +208,7 @@ def addTaxonomyOptions(optionManager):
|
|||||||
metavar="<dirty_file>",
|
metavar="<dirty_file>",
|
||||||
type="str",
|
type="str",
|
||||||
default=None,
|
default=None,
|
||||||
help="if chosen, ALL the words in the name used to identify the sequences will be searched"
|
help="(not with UNITE databases) if chosen, ALL the words in the name used to identify the sequences will be searched"
|
||||||
" when neither the exact name nor the genus have been found."
|
" when neither the exact name nor the genus have been found."
|
||||||
" Only use if the sequences in your database are badly named with useless words or numbers"
|
" Only use if the sequences in your database are badly named with useless words or numbers"
|
||||||
" in the name etc."
|
" in the name etc."
|
||||||
@ -193,7 +219,7 @@ def addTaxonomyOptions(optionManager):
|
|||||||
metavar="<dbtype>",
|
metavar="<dbtype>",
|
||||||
type="string",
|
type="string",
|
||||||
default='raw',
|
default='raw',
|
||||||
help="type of the database with the taxa to be added. Possibilities : 'raw' or 'UNITE'."
|
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
|
||||||
" Default : raw.")
|
" Default : raw.")
|
||||||
|
|
||||||
optionManager.add_option('-T','--tag-name',
|
optionManager.add_option('-T','--tag-name',
|
||||||
@ -234,18 +260,20 @@ if __name__=='__main__':
|
|||||||
|
|
||||||
(options,entries) = optionParser()
|
(options,entries) = optionParser()
|
||||||
|
|
||||||
|
tax=loadTaxonomyDatabase(options)
|
||||||
|
|
||||||
if options.db_type == 'raw' :
|
if options.db_type == 'raw' :
|
||||||
entryIterator=fastaIterator
|
entryIterator=fastaIterator
|
||||||
elif options.db_type == 'UNITE' :
|
elif options.db_type == 'UNITE' :
|
||||||
entryIterator=UNITEIterator
|
entryIterator=UNITEIterator
|
||||||
|
elif options.db_type == 'SILVA' :
|
||||||
|
entryIterator=SILVAIterator
|
||||||
|
options.tagname = 'species_name'
|
||||||
|
|
||||||
entries = entryIterator(entries)
|
entries = entryIterator(entries)
|
||||||
|
|
||||||
tax=loadTaxonomyDatabase(options)
|
|
||||||
|
|
||||||
openFiles(options)
|
openFiles(options)
|
||||||
|
|
||||||
if options.db_type == 'raw' :
|
if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
|
||||||
|
|
||||||
if options.res_anc == '' :
|
if options.res_anc == '' :
|
||||||
restricting_ancestor = None
|
restricting_ancestor = None
|
||||||
@ -287,59 +315,67 @@ if __name__=='__main__':
|
|||||||
|
|
||||||
|
|
||||||
elif options.db_type == 'UNITE' :
|
elif options.db_type == 'UNITE' :
|
||||||
raise NotImplemented
|
restricting_ancestor = tax.findTaxonByName('Fungi')[0]
|
||||||
# for s in entries :
|
for s in entries :
|
||||||
#
|
|
||||||
# try:
|
try:
|
||||||
# species_name = s["UNITE_species_name"]
|
species_name = s["UNITE_species_name"]
|
||||||
# taxid = getTaxid(tax, species_name)
|
taxid = getTaxid(tax, species_name, restricting_ancestor)
|
||||||
# s['taxid']=taxid
|
s['taxid']=taxid
|
||||||
# s["species_name"] = species_name
|
s["species_name"] = species_name
|
||||||
# print formatFasta(s)
|
print formatFasta(s)
|
||||||
#
|
|
||||||
# except KeyError:
|
except KeyError:
|
||||||
# try:
|
try:
|
||||||
# species_name = s["ISDN_species_name"]
|
species_name = s["ISDN_species_name"]
|
||||||
# taxid = getTaxid(tax, species_name)
|
print species_name
|
||||||
# s['taxid']=taxid
|
taxid = getTaxid(tax, species_name, restricting_ancestor)
|
||||||
# s["species_name"] = species_name
|
s['taxid']=taxid
|
||||||
# print formatFasta(s)
|
s["species_name"] = species_name
|
||||||
#
|
print formatFasta(s)
|
||||||
# except KeyError:
|
|
||||||
#
|
except KeyError:
|
||||||
# if s["UNITE_species_name"] != "-" :
|
|
||||||
# species_name = s["UNITE_species_name"]
|
if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" :
|
||||||
#
|
s["species_name"] = s["UNITE_species_name"]
|
||||||
# s["species_name"] = species_name
|
chosen = 'unite'
|
||||||
#
|
|
||||||
# if options.genusdefined is not None:
|
elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" :
|
||||||
#
|
s["species_name"] = s["ISDN_species_name"]
|
||||||
# try:
|
chosen = 'isdn'
|
||||||
# genusTaxid = getGenusTaxid(species_name)
|
|
||||||
#
|
else :
|
||||||
# if options.species_list == True or options.similar_names_list == True :
|
if s["UNITE_path"] != "-" and s["UNITE_path"] != "" :
|
||||||
# speciesList = [i for i in tax.subTreeIterator(genusTaxid)]
|
chosen = 'unite'
|
||||||
#
|
s["species_name"] = (s["UNITE_path"].split(', '))[-1]
|
||||||
# if options.species_list == True :
|
|
||||||
# s['species in this genus'] = getAllSpeciesFromGenus(speciesList)
|
elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
|
||||||
#
|
chosen = 'isdn'
|
||||||
# if options.similar_names_list == True :
|
s["species_name"] = (s["ISDN_path"].split(', '))[-1]
|
||||||
# similar_names = lookForSimilarSpeciesNameInGenus(species_name, speciesList)
|
|
||||||
# if similar_names != None :
|
else :
|
||||||
# s['similar_names'] = similar_names
|
print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
|
||||||
#
|
|
||||||
# print>> options.genusdefined,formatFasta(s)
|
if chosen == 'unite' :
|
||||||
#
|
s['path'] = s["UNITE_path"]
|
||||||
# except KeyError:
|
else :
|
||||||
#
|
s['path'] = s["ISDN_path"]
|
||||||
# if options.unidentified is not None:
|
|
||||||
#
|
if options.unidentified is not None :
|
||||||
# if options.similar_names_list == True :
|
print>>options.unidentified,formatFasta(s)
|
||||||
# similar_names = lookForSimilarNamesInTaxonomy(species_name, tax._taxonomy)
|
|
||||||
# if similar_names != None :
|
|
||||||
# s['similar_names'] = similar_names
|
|
||||||
#
|
|
||||||
# print>> options.unidentified,formatFasta(s)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user