Add option to ecoPCRFormat to deal with an obischema db as taxonomy source

git-svn-id: https://www.grenoble.prabi.fr/svn/LECASofts/ecoPCR/trunk@158 60f365c0-8329-0410-b2a4-ec073aeeaa1d
2008-02-08 10:27:42 +00:00
parent a1141a77b5
commit f68f4af244
1 changed files with 70 additions and 3 deletions
--- a/tools/ecoPCRFormat.py
+++ b/tools/ecoPCRFormat.py
@ -7,6 +7,12 @@ import sys
 import time
 import getopt

+try:
+    import psycopg2
+    _dbenable=True
+except ImportError:
+    _dbenable=False
+
 #####
 #
 #
@ -215,7 +221,56 @@ def readTaxonomyDump(taxdir):
    
    return taxonomy,ranks,alternativeName,index

+def readTaxonomyDB(dbname):
+    connection = psycopg2.connect(database=dbname)
+    
+    cursor = connection.cursor()
+    cursor.execute("select numid,rank,parent from ncbi_taxonomy.taxon")
+    taxonomy=[list(x) for x in cursor]

+    cursor.execute("select rank_class from ncbi_taxonomy.taxon_rank_class order by rank_class")
+    ranks=cursor.fetchall()
+    ranks = dict(map(None,(x[0] for x in ranks),xrange(len(ranks))))
+    
+    print >>sys.stderr,"Sorting taxons..."
+    taxonomy.sort(taxonCmp)
+
+    print >>sys.stderr,"Indexing taxonomy..."
+    index = {}
+    for t in taxonomy:
+        index[t[0]]=bsearchTaxon(taxonomy, t[0])
+    
+    print >>sys.stderr,"Indexing parent and rank..."
+    for t in taxonomy:
+        t[1]=ranks[t[1]]
+        try:
+            t[2]=index[t[2]]
+        except KeyError,e:
+            if t[2] is None and t[0]==1:
+                t[2]=index[t[0]]
+            else:
+                raise e
+        
+    cursor.execute("select taxid,name,category from ncbi_taxonomy.name")
+    
+    alternativeName=[]
+    for taxid,name,classname in cursor:
+        alternativeName.append((name,classname,index[taxid]))
+        if classname == 'scientific name':
+            taxonomy[index[taxid]].append(name)
+    
+    cursor.execute("select old_numid,current_numid from ncbi_taxonomy.taxon_id_alias")
+ 
+    print >>sys.stderr,"Adding taxid alias..."
+    for taxid,current in cursor:
+        if current is not None:
+            index[taxid]=index[current]
+        else:
+            index[taxid]=None
+            
+       
+    return taxonomy,ranks,alternativeName,index
+    
 #####
 #
 #
@ -293,11 +348,14 @@ def emblEntryParser(entry):

 ######################

+_fastaSplit=re.compile(';\W*')
+
 def parseFasta(seq):
+    seq=seq.split('\n')
    title = seq[0].strip()[1:].split(None,1)
    id=title[0]
    if len(title) == 2:
-        field = title[1].split('; ')
+        field = _fastaSplit.split(title[1])
    else:
        field=[]
    info = dict(x.split('=',1) for x in field if '=' in x)
@ -527,9 +585,10 @@ def ecoParseOptions(arguments):
           }
    
    o,filenames = getopt.getopt(arguments,
-                                'ht:n:gfe',
+                                'ht:T:n:gfe',
                                ['help',
                                 'taxonomy=',
+                                 'taxonomy_db=',
                                 'name=',
                                 'genbank',
                                 'fasta',
@ -540,7 +599,11 @@ def ecoParseOptions(arguments):
            printHelp()
            exit()
        elif name in ('-t','--taxonomy'):
+            opt['taxmod']='dump'
            opt['taxdir']=value
+        elif name in ('-T','--taxonomy_db'):
+            opt['taxmod']='db'
+            opt['taxdb']=value
        elif name in ('-n','--name'):
            opt['prefix']=value
        elif name in ('-g','--genbank'):
@ -578,7 +641,11 @@ if __name__ == '__main__':
    
    opt,filenames = ecoParseOptions(sys.argv[1:])
    
-    taxonomy = readTaxonomyDump(opt['taxdir'])
+    if opt['taxmod']=='dump':
+        taxonomy = readTaxonomyDump(opt['taxdir'])
+    elif opt['taxmod']=='db':
+        taxonomy = readTaxonomyDB(opt['taxdb'])
+        
   
    ecoDBWriter(opt['prefix'], taxonomy, filenames, opt['parser'])