Updated ecoFindTaxids to read UNITE databases; also added support for SILVA databases.

2012-10-16 12:52:47 +00:00
parent 4a9fbc367b
commit 82f743447c
1 changed files with 99 additions and 63 deletions
--- a/src/ecoFindTaxids.py
+++ b/src/ecoFindTaxids.py
@ -33,6 +33,31 @@ def UNITEIterator(f):
        yield s


+def SILVAIterator(f):
+    """Yield one NucSequence per FASTA entry of a SILVA-formatted file.
+
+    Headers are read as ``>id | name``; anything from the first '(' of
+    ``name`` onwards is dropped and the rest is stored, trimmed, in the
+    'species_name' tag of the sequence.
+    """
+    fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
+    for entry in fastaEntryIterator(f) :
+        lines = entry.split('\n')
+        fields = lines[0].split(' | ')
+        seq_id = fields[0][1:]    # skip the first character (the '>' marker)
+        # the sequence may be wrapped over several lines: join them all
+        seq = ''.join([l.strip() for l in lines[1:]])
+        s = NucSequence(seq_id, seq)
+        
+        # a header without ' | ' gives an empty name instead of an IndexError
+        name = ''
+        if len(fields) > 1 :
+            # cut at the first '(' and trim, so no stray blank is kept
+            name = fields[1].split('(', 1)[0].strip()
+        s['species_name'] = name
+        yield s
+
+
 def lookForSimilarSpeciesNameInGenus(species_name, species_list):
    
    genus_species = species_name.split(' ')
@ -108,6 +133,7 @@ def getGenusTaxid(tax, species_name, ancestor):
 def getTaxid(tax, name, ancestor):
    
    taxid = tax.findTaxonByName(name)[0]
+    #print '\n~~~~~~~in getTaxid: ', name, ', taxid :', taxid,', anc = ', tax.isAncestor(ancestor, taxid), '\n'
    if ancestor != None and not tax.isAncestor(ancestor, taxid) :
        raise KeyError()
    
@ -168,7 +194,7 @@ def addTaxonomyOptions(optionManager):
                             metavar="<FILENAME>",
                             type="string",
                             default=None,
-                             help="file used to store sequences with the genus found.")
+                             help="(not with UNITE databases) file used to store sequences with the genus found.")

    optionManager.add_option('-u','--unidentified',
                             action="store", dest="unidentified",
@ -182,7 +208,7 @@ def addTaxonomyOptions(optionManager):
                             metavar="<dirty_file>",
                             type="str",
                             default=None,
-                             help="if chosen, ALL the words in the name used to identify the sequences will be searched"
+                             help="(not with UNITE databases) if chosen, ALL the words in the name used to identify the sequences will be searched"
                                  " when neither the exact name nor the genus have been found."
                                  " Only use if the sequences in your database are badly named with useless words or numbers"
                                  " in the name etc."
@ -193,7 +219,7 @@ def addTaxonomyOptions(optionManager):
                             metavar="<dbtype>",
                             type="string",
                             default='raw',
-                             help="type of the database with the taxa to be added. Possibilities : 'raw' or 'UNITE'."
+                             help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
                             " Default : raw.")
    
    optionManager.add_option('-T','--tag-name',
@ -234,18 +260,20 @@ if __name__=='__main__':
    
    (options,entries) = optionParser()
    
+    tax=loadTaxonomyDatabase(options)
+    
    if options.db_type == 'raw' :
        entryIterator=fastaIterator
    elif options.db_type == 'UNITE' :
        entryIterator=UNITEIterator
+    elif options.db_type == 'SILVA' :
+        entryIterator=SILVAIterator
+        options.tagname = 'species_name'
    
    entries = entryIterator(entries)
-    
-    tax=loadTaxonomyDatabase(options)
-   
    openFiles(options)
    
-    if options.db_type == 'raw' :
+    if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
        
        if options.res_anc == '' :
            restricting_ancestor = None
@ -287,62 +315,70 @@ if __name__=='__main__':


    elif options.db_type == 'UNITE' :
-        raise NotImplemented        
-#        for s in entries :
-#            
-#            try:
-#                species_name = s["UNITE_species_name"]
-#                taxid = getTaxid(tax, species_name)
-#                s['taxid']=taxid
-#                s["species_name"] = species_name
-#                print formatFasta(s)
-#            
-#            except KeyError:
-#                try:
-#                    species_name = s["ISDN_species_name"]
-#                    taxid = getTaxid(tax, species_name)
-#                    s['taxid']=taxid
-#                    s["species_name"] = species_name
-#                    print formatFasta(s)
-#                
-#                except KeyError:
-#                    
-#                    if s["UNITE_species_name"] != "-" :
-#                        species_name = s["UNITE_species_name"]
-#                    
-#                    s["species_name"] = species_name
-#                    
-#                    if options.genusdefined is not None:
-#                        
-#                        try:
-#                            genusTaxid = getGenusTaxid(species_name)
-#                            
-#                            if options.species_list == True or options.similar_names_list == True :
-#                                speciesList = [i for i in tax.subTreeIterator(genusTaxid)]
-#                            
-#                            if options.species_list == True :
-#                                s['species in this genus'] = getAllSpeciesFromGenus(speciesList)
-#                            
-#                            if options.similar_names_list == True :
-#                                similar_names = lookForSimilarSpeciesNameInGenus(species_name, speciesList)
-#                                if similar_names != None :
-#                                    s['similar_names'] = similar_names
-#                            
-#                            print>> options.genusdefined,formatFasta(s)
-#                    
-#                        except KeyError:
-#                            
-#                            if options.unidentified is not None:
-#                                
-#                                if options.similar_names_list == True :
-#                                    similar_names = lookForSimilarNamesInTaxonomy(species_name, tax._taxonomy)
-#                                    if similar_names != None :
-#                                        s['similar_names'] = similar_names
-#                                    
-#                                print>> options.unidentified,formatFasta(s)
-                                
-                                
-                                
+        # getTaxid raises KeyError for any taxon that is not under 'Fungi'
+        restricting_ancestor = tax.findTaxonByName('Fungi')[0]
+        for s in entries :
+            # reset per record so a previous record's choice is never reused
+            chosen = None
+            try:
+                species_name = s["UNITE_species_name"]
+                taxid = getTaxid(tax, species_name, restricting_ancestor)
+                s['taxid']=taxid
+                s["species_name"] = species_name
+                print formatFasta(s)
+            except KeyError:
+                try:
+                    # the UNITE name gave no taxid: try the ISDN name
+                    species_name = s["ISDN_species_name"]
+                    taxid = getTaxid(tax, species_name, restricting_ancestor)
+                    s['taxid']=taxid
+                    s["species_name"] = species_name
+                    print formatFasta(s)
+                
+                except KeyError:
+                    # no taxid at all: keep the best available name and path
+                    if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" :
+                        s["species_name"] = s["UNITE_species_name"]
+                        chosen = 'unite'
+                    
+                    elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" :
+                        s["species_name"] = s["ISDN_species_name"]
+                        chosen = 'isdn'
+                    
+                    elif s["UNITE_path"] != "-" and s["UNITE_path"] != "" :
+                        chosen = 'unite'
+                        s["species_name"] = (s["UNITE_path"].split(', '))[-1]
+                    
+                    elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
+                        chosen = 'isdn'
+                        s["species_name"] = (s["ISDN_path"].split(', '))[-1]
+                    
+                    else : 
+                        print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
+                    
+                    if chosen == 'unite' :
+                        s['path'] = s["UNITE_path"]
+                    elif chosen == 'isdn' :
+                        s['path'] = s["ISDN_path"]
+                    
+                    if options.unidentified is not None :
+                        print>>options.unidentified,formatFasta(s)
+
+
+
+
+            
+            
+            
+            
+            
+            
+            
+            
+            
+            
+            
+