Switch to version 1.2.3

Fixed major bug with -f UNITE_FULL option of obiaddtaxids
Merge branch 'master' of git.metabarcoding.org:obitools/obitools
2016-04-04 18:49:10 +02:00 · 2016-04-04 16:36:05 +02:00 · 2016-03-30 16:35:49 +02:00 · 2016-03-17 15:05:18 +01:00 · 2016-01-12 20:57:47 +01:00 · 2016-01-12 20:53:56 +01:00
8 changed files with 135 additions and 90 deletions
@@ -15,7 +15,7 @@
        output file. FLOAT is included in a [0,1] interval.

   .. cmdoption::    --minimum-circle=FLOAT
-   
+
        minimum identity considered for the assignment circle.
        FLOAT is included in a [0,1] interval.

@@ -40,11 +40,20 @@
   .. cmdoption::  -E FLOAT, --errors=FLOAT

        FLOAT is the fraction of reference sequences that will
-        be ignored when looking for the most recent common ancestor. This
+        be ignored when looking for the lowest common ancestor. This
        option is useful when a non-negligible proportion of reference sequences
        is expected to be assigned to the wrong taxon, for example because of
        taxonomic misidentification. FLOAT is included in a [0,1] interval.

+
+   .. cmdoption::  -M INTEGER, --min-matches=FLOAT
+
+        Define the minimum congruent assignation. If this minimum is reached and
+        the -E option is activated, the lowest common ancestor algorithm tolarated
+        that some sequences do not provide the same taxonomic annotation (see the
+        -E option).
+
+
   .. cmdoption::  --cache-size=INTEGER

        A cache for computed similarities is maintained by `ecotag`. the default
@@ -19,7 +19,7 @@ from os import path


 PACKAGE = "OBITools"
-VERSION = "1.2.0"
+VERSION = "1.2.3"
 AUTHOR  = 'Eric Coissac'
 EMAIL   = 'eric@coissac.eu'
 URL     = 'metabarcoding.org/obitools'
@@ -1,4 +1,4 @@
-#!/usr/local/bin/python
+#!/usr/local/OBITools-1.1.22/bin/python
 '''
 :py:mod:`ecotag`: assigns sequences to taxa
 ===========================================
@@ -145,9 +145,16 @@ def addSearchOptions(optionManager):

    optionManager.add_option('-E','--errors',
                             action='store',dest='error',
+                             type='float',
                             default=0.0,
                             help='Tolerated rate of wrong assignation')    

+    optionManager.add_option('-M','--min-matches',
+                             action='store',dest='minmatches',
+                             type="int",
+                             default=1,
+                             help='Minimum congruent assignation')    
+
    optionManager.add_option('--cache-size',
                             action='store',dest='cache',
                             type='int',
@@ -333,10 +340,10 @@ if __name__=='__main__':
    rankid = taxonomy.findRankByName(options.explain)

    for seq in db:
-        id = seq.id[0:46]
-        seq.id=id
-        assert id not in taxonlink
-        taxonlink[id]=int(seq['taxid'])
+        seqid = seq.id[0:46]
+        seq.id=seqid
+        assert seqid not in taxonlink
+        taxonlink[seqid]=int(seq['taxid'])
        
                
    if options.uniq:
@@ -363,7 +370,12 @@ if __name__=='__main__':

        if best[0]:
            taxlist = set(taxonlink[p[0].id] for p in match)
-            lca = taxonomy.betterCommonTaxon(options.error,*tuple(taxlist))
+            if options.error > 0.0 and len(match) >= int(options.minmatches / (1.0 - options.error)):
+                lca = taxonomy.betterCommonTaxon(options.error,
+                                                 *tuple(taxlist))
+            else:
+                lca = taxonomy.betterCommonTaxon(0.0,*tuple(taxlist))
+                
            scname = taxonomy.getScientificName(lca)
            rank = taxonomy.getRank(lca)
            if len(taxlist) < 15:
@@ -48,7 +48,7 @@ Otherwise,
    
    .. code-block:: bash
    
-       > obiaddtaxids -T species_name -g genus_identified.fasta \\
+       > obiaddtaxids -k species_name -g genus_identified.fasta \\
                      -u unidentified.fasta -d my_ecopcr_database \\
                      my_sequences.fasta > identified.fasta

@@ -340,6 +340,7 @@ if __name__=='__main__':
        entries = entryIterator(entries)
    elif options.db_type == 'UNITE_FULL' :
        entryIterator = UNITEIterator_FULL
+        entries = entryIterator(entries)
    elif options.db_type == 'UNITE_GENERAL' :
        entryIterator = UNITEIterator_GENERAL
        entries = entryIterator(entries)
@@ -23,8 +23,11 @@ def minimum(seqs):
    return min(s['select'] for s in seqs)

 def maximum(seqs):
-    return max(s['select'] for s in seqs)
-
+    try:
+        return max(s['select'] for s in seqs)
+    except TypeError, e:
+        print >>sys.stderr, seqs
+        raise e
 def mean(seqs):
    ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
    return float(ss) / len(seqs)
@@ -101,6 +104,13 @@ def addSelectOptions(optionManager):
                             default=[],
                             help="attributes to merge within each group")

+    group.add_option('-s','--sample',
+                             action="store", dest="sample",
+                             metavar="<TAGNAME>",
+                             type="str",
+                             default=None,
+                             help="Tag containing sample descriptions, the default value is set to *merged_sample*")
+
    group.add_option('--merge-ids',
                             action="store_true", dest="mergeids",
                             default=False,
@@ -129,16 +139,22 @@ if __name__ == '__main__':
    classes = {}
    
    print >>sys.stderr,"\nLoading sequences...\n"
+    
+    with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None

+    nbseq=0
+    
    for s in entries:
+        nbseq+=1
        category = []
+
+        if with_taxonomy:
+            environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
+        else:
+            environ = {'sequence':s,'random':random()}
+
        for c in options.categories:
            try:
-                if hasattr(options, 'taxonomy') and options.taxonomy is not None:
-                    environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
-                else:
-                    environ = {'sequence':s,'random':random()}
-        
                v = eval(c,environ,s)
                category.append(v)
            except:
@@ -149,11 +165,6 @@ if __name__ == '__main__':
        group.append(s)
        classes[category]= group
            
-        if hasattr(options, 'taxonomy') and options.taxonomy is not None:
-            environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
-        else:
-            environ = {'sequence':s, 'random':random()}
-
        try:    
            select =  eval(options.function,environ,s)
            s['select']=select
@@ -181,75 +192,87 @@ if __name__ == '__main__':
        i+=1
        progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
        seqs = classes[c]
-        sortclass(seqs, options)
-        if len(c)==1:
-            c=c[0]
+        if options.sample is not None:
+            subsets = {}
+            for s in seqs:
+                for sid in s[options.sample]:
+                    ss = subsets.get(sid,[])
+                    ss.append(s)
+                    subsets[sid]=ss
+        else:
+            subsets={"all":seqs}
            
-        if options.number==1:
-            s = seqs[0]
+        for seqs in subsets.values():
+            sortclass(seqs, options)
            
-            for key in mergedKey:
-                if key=='taxid' and mergeIds:
-                    if 'taxid_dist' not in s:
-                        s["taxid_dist"]={}
-                    if 'taxid' in s:
-                        s["taxid_dist"][s.id]=s['taxid']
-                mkey = "merged_%s" % key 
-                if mkey not in s:
-                    if key in s:
-                        s[mkey]={s[key]:1}
-                    else:
-                        s[mkey]={}
-
-            if 'count' not in s:
-                s['count']=1
-            if mergeIds:        
-                s['merged']=[s.id]
-
-            for seq in seqs[1:]:
+            if len(c)==1:
+                c=c[0]
                
-                s['count']+=seq['count']
+            if options.number==1 and options.sample is None:
+                s = seqs[0]
                
                for key in mergedKey:
                    if key=='taxid' and mergeIds:
-                        if 'taxid_dist' in seq:
-                            s["taxid_dist"].update(seq["taxid_dist"])
-                        if 'taxid' in seq:
-                            s["taxid_dist"][seq.id]=seq['taxid']
-                            
+                        if 'taxid_dist' not in s:
+                            s["taxid_dist"]={}
+                        if 'taxid' in s:
+                            s["taxid_dist"][s.id]=s['taxid']
                    mkey = "merged_%s" % key 
-                    if mkey in seq:
-                        m = seq[mkey]
-                    else:
-                        if key in seq:
-                            m={seq[key]:1}
-                            
-                    allmkey = set(m.keys()) | set(s[mkey].keys())
-                    s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
-
-#                     if mkey in seq:
-#                         for skey in seq[mkey]:
-#                             if skey in s:
-#                                 s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
-#                             else:
-#                                 s[mkey][skey]=seq[mkey][skey]
-                                
-                #for key in seq.iterkeys():
-                #    # Merger proprement l'attribut merged s'il exist
-                #    if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
-                #        del(s[key])
-                    
-                            
+                    if mkey not in s:
+                        if key in s:
+                            s[mkey]={s[key]:1}
+                        else:
+                            s[mkey]={}
+    
+                if 'count' not in s:
+                    s['count']=1
                if mergeIds:        
-                    s['merged'].append(seq.id)
- 
-        if taxonomy is not None:
-            mergeTaxonomyClassification(seqs, taxonomy)
+                    s['merged']=[s.id]
+    
+                for seq in seqs[1:]:
                    
-        
-        for s in seqs[0:options.number]:
-            s['class']=c
-            del s['select']
-            writer(s)
+                    s['count']+=seq['count']
+                    
+                    for key in mergedKey:
+                        if key=='taxid' and mergeIds:
+                            if 'taxid_dist' in seq:
+                                s["taxid_dist"].update(seq["taxid_dist"])
+                            if 'taxid' in seq:
+                                s["taxid_dist"][seq.id]=seq['taxid']
+                                
+                        mkey = "merged_%s" % key 
+                        if mkey in seq:
+                            m = seq[mkey]
+                        else:
+                            if key in seq:
+                                m={seq[key]:1}
+                                
+                        allmkey = set(m.keys()) | set(s[mkey].keys())
+                        s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
+                                                    
+                    if mergeIds:        
+                        s['merged'].append(seq.id)
+     
+                if taxonomy is not None:
+                    mergeTaxonomyClassification(seqs, taxonomy)
+                        
            
+            for s in seqs[0:options.number]:
+                s['class']=c
+                s['__@TOWRITE@__']=True
+     
+    print >>sys.stderr,"\Writing sequences...\n"
+    progressBar(1,nbseq,True,'Writing')
+           
+    i=0
+    for c in classes:
+        seqs = classes[c]
+        for s in seqs:
+            i+=1
+            progressBar(i,nbseq,False,"Writing")
+            if '__@TOWRITE@__' in s:
+                del s['__@TOWRITE@__']
+                del s['select']
+                writer(s)
+        
    print >>sys.stderr
@@ -320,9 +320,9 @@ if __name__ == '__main__':
                                                     parent[3],parent[0],options.taxonomy._ranks[parent[1]])
        localdata=True

-    for n in options.newname:
-        tx = t.split(t,':')
-        taxid = options.taxonomy.addPreferedName(tx[0].strip(),tx[1])
+    for t in options.newname:
+        tx = t.split(':')
+        taxid = options.taxonomy.addPreferedName(int(tx[1]), tx[0].strip())
        print "name : %8d\t->\t%s" % (taxid,options.taxonomy.getPreferedName(taxid))
             
    ecoTaxonomyWriter(options.ecodb,options.taxonomy,onlyLocal=True)
@@ -269,7 +269,7 @@ class Taxonomy(object):
    
    def addPreferedName(self,taxid,name):
        idx = self.findIndex(taxid)
-        self._preferedName.append(name,'obi',idx)
+        self._preferedName.append([name,'obi',idx])
        self._preferedidx[idx]=name
        return taxid

@@ -399,10 +399,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
        for t in self._taxonomy[self._localtaxon:]:
            self._name.append((t[3],'scientific name',i))
            i+=1
-                
+            
        try :
            self._preferedName = [(x[0],'obi',x[2]) 
-                                  for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
+                                  for x in self.__ecoNameIterator(self._preferedNamesFile)]
            print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
        except:
            print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
@@ -1,5 +1,5 @@
 major = 1
 minor = 2
-serial= '0'
+serial= '3'

 version = "%2d.%02d %s" % (major,minor,serial)
Author	SHA1	Message	Date
coissac	ffb33f61a9	Switch to version 1.2.3	2016-04-04 18:49:10 +02:00
Celine Mercier	adbeac9684	Fixed major bug with -f UNITE_FULL option of obiaddtaxids	2016-04-04 16:36:05 +02:00
Celine Mercier	4aac47e639	Merge branch 'master' of git.metabarcoding.org:obitools/obitools	2016-03-30 16:35:49 +02:00
Celine Mercier	2028c69b3f	Fixed typo in obiaddtaxids documentation	2016-03-17 15:05:18 +01:00
coissac	2649cd268c	Switch to version 1.2.2	2016-01-12 20:57:47 +01:00
coissac	46b446f0c8	Add capacity to filter on merged sequences	2016-01-12 20:53:56 +01:00
Celine Mercier	6208cbee02	Fixes #20 and several other bugs related to that function	2016-01-08 11:40:39 +01:00
coissac	1d4c3f99b2	Patch a bug related to the -E et -M options	2015-10-27 20:16:07 +01:00
coissac	ba30e74dc9	Switch the version number to 1.2.1	2015-10-27 18:59:51 +01:00
coissac	0e8b97ca40	Modify the action of the -M option to fit the documentation	2015-10-27 18:58:32 +01:00
coissac	a95ae5126b	Change the ecotag doc according to the new -M option	2015-10-27 18:56:49 +01:00
coissac	0e2f859839	Patch bug in the -E option of ecotag and add the -M option to limit the action of the -E option	2015-10-27 18:48:03 +01:00