17 Commits

Author SHA1 Message Date
coissac ffb33f61a9 Switch to version 1.2.3 2016-04-04 18:49:10 +02:00
Celine Mercier adbeac9684 Fixed major bug with -f UNITE_FULL option of obiaddtaxids 2016-04-04 16:36:05 +02:00
Celine Mercier 4aac47e639 Merge branch 'master' of git.metabarcoding.org:obitools/obitools 2016-03-30 16:35:49 +02:00
Celine Mercier 2028c69b3f Fixed typo in obiaddtaxids documentation 2016-03-17 15:05:18 +01:00
coissac 2649cd268c Switch to version 1.2.2 2016-01-12 20:57:47 +01:00
coissac 46b446f0c8 Add capacity to filter on merged sequences 2016-01-12 20:53:56 +01:00
Celine Mercier 6208cbee02 Fixes #20 and several other bugs related to that function 2016-01-08 11:40:39 +01:00
coissac 1d4c3f99b2 Patch a bug related to the -E et -M options 2015-10-27 20:16:07 +01:00
coissac ba30e74dc9 Switch the version number to 1.2.1 2015-10-27 18:59:51 +01:00
coissac 0e8b97ca40 Modify the action of the -M option to fit the documentation 2015-10-27 18:58:32 +01:00
coissac a95ae5126b Change the ecotag doc according to the new -M option 2015-10-27 18:56:49 +01:00
coissac 0e2f859839 Patch bug in the -E option of ecotag and add the -M option to limit the
action of the -E option
2015-10-27 18:48:03 +01:00
coissac 23717f5833 Add a first version of the obisubset tool and tag the version 1.2.0 2015-10-20 22:02:02 +02:00
coissac fdeaf5956c Change the way ecoPCRDB are written by obitools. If the obitools is
called with several sequence files as input when the ecoPCRDB is
requested as output format, the sequences are splitted in several sdx
files
2015-10-02 16:36:25 +02:00
coissac 343d9ec6df Switch to version 1.1.22 2015-08-03 23:16:37 +02:00
coissac 7a993ec841 Patch a wrong import in the file 2015-08-03 23:14:13 +02:00
celinemercier 6170fac081 fixed a problem with the progress bar in obiselect when there is more
than one category
2015-07-20 15:24:30 +02:00
14 changed files with 377 additions and 116 deletions
+2 -3
View File
@@ -3,13 +3,12 @@ Sequence sampling and filtering
.. toctree::
:maxdepth: 2
scripts/obiextract
scripts/obigrep
scripts/obihead
scripts/obisample
scripts/obiselect
scripts/obisplit
scripts/obisubset
scripts/obitail
+11 -2
View File
@@ -15,7 +15,7 @@
output file. FLOAT is included in a [0,1] interval.
.. cmdoption:: --minimum-circle=FLOAT
minimum identity considered for the assignment circle.
FLOAT is included in a [0,1] interval.
@@ -40,11 +40,20 @@
.. cmdoption:: -E FLOAT, --errors=FLOAT
FLOAT is the fraction of reference sequences that will
be ignored when looking for the most recent common ancestor. This
be ignored when looking for the lowest common ancestor. This
option is useful when a non-negligible proportion of reference sequences
is expected to be assigned to the wrong taxon, for example because of
taxonomic misidentification. FLOAT is included in a [0,1] interval.
.. cmdoption:: -M INTEGER, --min-matches=FLOAT
Define the minimum congruent assignation. If this minimum is reached and
the -E option is activated, the lowest common ancestor algorithm tolarated
that some sequences do not provide the same taxonomic annotation (see the
-E option).
.. cmdoption:: --cache-size=INTEGER
A cache for computed similarities is maintained by `ecotag`. the default
+81
View File
@@ -0,0 +1,81 @@
.. automodule:: obisubset
:py:mod:`obisubset` specific options
------------------------------------
.. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>,
The option ``-s`` allows to specify the tag containing sample descriptions,
the default value is set to *merged_sample*.
*Example:*
.. code-block:: bash
> obiuniq -m sample seq1.fasta > seq2.fasta
> obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
After the dereplication of the sequences using the
in the new attribute ``merged_sample``.
.. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>,
Another tag to clean according to the sample subset
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
.. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>,
File containing the samples names (one sample id per line).
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -l ids.txt seq2.fasta > seq3.fasta
.. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>,
A regular expression pattern matching the sample ids to extract.
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -p "negative_.*" seq2.fasta > seq3.fasta
.. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>,
A sample id to extract
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
.. include:: ../optionsSet/inputformat.txt
.. include:: ../optionsSet/outputformat.txt
.. include:: ../optionsSet/defaultoptions.txt
:py:mod:`obisubset` modifies sequence attributes
------------------------------------------------
.. hlist::
:columns: 3
- :doc:`count <../attributes/count>`
- :doc:`merged_* <../attributes/merged_star>`
:py:mod:`obisubset` used sequence attribute
-------------------------------------------
- :doc:`count <../attributes/taxid>`
- :doc:`merged_* <../attributes/merged_star>`
+1 -1
View File
@@ -19,7 +19,7 @@ from os import path
PACKAGE = "OBITools"
VERSION = "1.1.21"
VERSION = "1.2.3"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools'
+18 -6
View File
@@ -1,4 +1,4 @@
#!/usr/local/bin/python
#!/usr/local/OBITools-1.1.22/bin/python
'''
:py:mod:`ecotag`: assigns sequences to taxa
===========================================
@@ -145,9 +145,16 @@ def addSearchOptions(optionManager):
optionManager.add_option('-E','--errors',
action='store',dest='error',
type='float',
default=0.0,
help='Tolerated rate of wrong assignation')
optionManager.add_option('-M','--min-matches',
action='store',dest='minmatches',
type="int",
default=1,
help='Minimum congruent assignation')
optionManager.add_option('--cache-size',
action='store',dest='cache',
type='int',
@@ -333,10 +340,10 @@ if __name__=='__main__':
rankid = taxonomy.findRankByName(options.explain)
for seq in db:
id = seq.id[0:46]
seq.id=id
assert id not in taxonlink
taxonlink[id]=int(seq['taxid'])
seqid = seq.id[0:46]
seq.id=seqid
assert seqid not in taxonlink
taxonlink[seqid]=int(seq['taxid'])
if options.uniq:
@@ -363,7 +370,12 @@ if __name__=='__main__':
if best[0]:
taxlist = set(taxonlink[p[0].id] for p in match)
lca = taxonomy.betterCommonTaxon(options.error,*tuple(taxlist))
if options.error > 0.0 and len(match) >= int(options.minmatches / (1.0 - options.error)):
lca = taxonomy.betterCommonTaxon(options.error,
*tuple(taxlist))
else:
lca = taxonomy.betterCommonTaxon(0.0,*tuple(taxlist))
scname = taxonomy.getScientificName(lca)
rank = taxonomy.getRank(lca)
if len(taxlist) < 15:
+2 -1
View File
@@ -48,7 +48,7 @@ Otherwise,
.. code-block:: bash
> obiaddtaxids -T species_name -g genus_identified.fasta \\
> obiaddtaxids -k species_name -g genus_identified.fasta \\
-u unidentified.fasta -d my_ecopcr_database \\
my_sequences.fasta > identified.fasta
@@ -340,6 +340,7 @@ if __name__=='__main__':
entries = entryIterator(entries)
elif options.db_type == 'UNITE_FULL' :
entryIterator = UNITEIterator_FULL
entries = entryIterator(entries)
elif options.db_type == 'UNITE_GENERAL' :
entryIterator = UNITEIterator_GENERAL
entries = entryIterator(entries)
+97 -74
View File
@@ -23,8 +23,11 @@ def minimum(seqs):
return min(s['select'] for s in seqs)
def maximum(seqs):
return max(s['select'] for s in seqs)
try:
return max(s['select'] for s in seqs)
except TypeError, e:
print >>sys.stderr, seqs
raise e
def mean(seqs):
ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
return float(ss) / len(seqs)
@@ -101,6 +104,13 @@ def addSelectOptions(optionManager):
default=[],
help="attributes to merge within each group")
group.add_option('-s','--sample',
action="store", dest="sample",
metavar="<TAGNAME>",
type="str",
default=None,
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
group.add_option('--merge-ids',
action="store_true", dest="mergeids",
default=False,
@@ -129,16 +139,22 @@ if __name__ == '__main__':
classes = {}
print >>sys.stderr,"\nLoading sequences...\n"
with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None
nbseq=0
for s in entries:
nbseq+=1
category = []
if with_taxonomy:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
for c in options.categories:
try:
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s,'random':random()}
v = eval(c,environ,s)
category.append(v)
except:
@@ -149,11 +165,6 @@ if __name__ == '__main__':
group.append(s)
classes[category]= group
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
else:
environ = {'sequence':s, 'random':random()}
try:
select = eval(options.function,environ,s)
s['select']=select
@@ -179,77 +190,89 @@ if __name__ == '__main__':
i=0
for c in classes:
i+=1
progressBar(i,lclasses,False,"%15s" % c)
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
seqs = classes[c]
sortclass(seqs, options)
if len(c)==1:
c=c[0]
if options.sample is not None:
subsets = {}
for s in seqs:
for sid in s[options.sample]:
ss = subsets.get(sid,[])
ss.append(s)
subsets[sid]=ss
else:
subsets={"all":seqs}
if options.number==1:
s = seqs[0]
for seqs in subsets.values():
sortclass(seqs, options)
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged']=[s.id]
for seq in seqs[1:]:
if len(c)==1:
c=c[0]
s['count']+=seq['count']
if options.number==1 and options.sample is None:
s = seqs[0]
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
if 'taxid_dist' not in s:
s["taxid_dist"]={}
if 'taxid' in s:
s["taxid_dist"][s.id]=s['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
# if mkey in seq:
# for skey in seq[mkey]:
# if skey in s:
# s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
# else:
# s[mkey][skey]=seq[mkey][skey]
#for key in seq.iterkeys():
# # Merger proprement l'attribut merged s'il exist
# if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
# del(s[key])
if mkey not in s:
if key in s:
s[mkey]={s[key]:1}
else:
s[mkey]={}
if 'count' not in s:
s['count']=1
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
s['merged']=[s.id]
for seq in seqs[1:]:
for s in seqs[0:options.number]:
s['class']=c
del s['select']
writer(s)
s['count']+=seq['count']
for key in mergedKey:
if key=='taxid' and mergeIds:
if 'taxid_dist' in seq:
s["taxid_dist"].update(seq["taxid_dist"])
if 'taxid' in seq:
s["taxid_dist"][seq.id]=seq['taxid']
mkey = "merged_%s" % key
if mkey in seq:
m = seq[mkey]
else:
if key in seq:
m={seq[key]:1}
allmkey = set(m.keys()) | set(s[mkey].keys())
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
if mergeIds:
s['merged'].append(seq.id)
if taxonomy is not None:
mergeTaxonomyClassification(seqs, taxonomy)
for s in seqs[0:options.number]:
s['class']=c
s['__@TOWRITE@__']=True
print >>sys.stderr,"\Writing sequences...\n"
progressBar(1,nbseq,True,'Writing')
i=0
for c in classes:
seqs = classes[c]
for s in seqs:
i+=1
progressBar(i,nbseq,False,"Writing")
if '__@TOWRITE@__' in s:
del s['__@TOWRITE@__']
del s['select']
writer(s)
print >>sys.stderr
+116
View File
@@ -0,0 +1,116 @@
#!/usr/local/bin/python
'''
:py:mod:`obisubset`: extract a subset of samples
================================================
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
after its dereplication using :py:mod:`obiuniq` program.
'''
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from obitools.options import getOptionManager
import re
def addSubsetOptions(optionManager):
group = optionManager.add_option_group('obisubset specific options')
group.add_option('-s','--sample',
action="store", dest="sample",
metavar="<TAGNAME>",
type="str",
default='merged_sample',
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
group.add_option('-o','--other-tag',
action="append", dest="taglist",
metavar="<TAGNAME>",
type="string",
default=[],
help="Another tag to clean according to the sample subset")
group.add_option('-l','--sample-list',
action="store", dest="samplelist",
metavar="<FILENAME>",
type="string",
default=None,
help="File containing the samples names (one sample id per line)")
group.add_option('-p','--sample-pattern',
action="store", dest="samplepattern",
metavar="<REGEX>",
type="string",
default=None,
help="A regular expression pattern matching the sample ids to extract")
group.add_option('-n','--sample-name',
action="append", dest="samplename",
metavar="<SAMPLEIDS>",
type="string",
default=[],
help="A sample id to extract")
def sequenceSelectorGenerator(options):
samplename = set(options.samplename)
othertags = set(options.taglist)
if options.samplelist is not None:
with open(options.samplelist) as lname :
for name in lname:
name = name.strip()
samplename.add(name)
if options.samplepattern is not None:
samplepattern = re.compile(options.samplepattern)
else:
samplepattern = None
def sequenceSelector(entries):
for entry in entries:
samples=entry[options.sample]
slist = set(samples.keys())
tokeep=slist & samplename
if samplepattern is not None:
for name in slist:
if samplepattern.match(name):
tokeep.add(name)
if tokeep:
newsample={}
newcount=0
for name in tokeep:
c = samples[name]
newsample[name]= c
newcount+=c
entry['count']=newcount
entry[options.sample]=newsample
for t in othertags:
if t in entry:
d = entry[t]
newd={}
for name in tokeep:
if name in d:
newd[name] = d[name]
entry[t]=newd
yield entry
return sequenceSelector
if __name__=='__main__':
optionParser = getOptionManager([addInOutputOption,addSubsetOptions],progdoc=__doc__)
(options, entries) = optionParser()
writer = sequenceWriterGenerator(options)
good = sequenceSelectorGenerator(options)
for seq in good(entries):
writer(seq)
+3 -3
View File
@@ -320,9 +320,9 @@ if __name__ == '__main__':
parent[3],parent[0],options.taxonomy._ranks[parent[1]])
localdata=True
for n in options.newname:
tx = t.split(t,':')
taxid = options.taxonomy.addPreferedName(tx[0].strip(),tx[1])
for t in options.newname:
tx = t.split(':')
taxid = options.taxonomy.addPreferedName(int(tx[1]), tx[0].strip())
print "name : %8d\t->\t%s" % (taxid,options.taxonomy.getPreferedName(taxid))
ecoTaxonomyWriter(options.ecodb,options.taxonomy,onlyLocal=True)
+36 -12
View File
@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceWriter(object):
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
from obitools.options import currentInputFileName
self.currentInputFileName=currentInputFileName
# Take care of the taxonomy associated to the database
self._currentfile=None
self._taxonomy= loadTaxonomyDatabase(options)
dbname = options.ecopcroutput
if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...",
@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object):
fileidx = max(list(int(p.search(i).group(1))
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
) +1
self._fileidx=fileidx
self._dbname=dbname
self._filename="%s_%03d.sdx" % (dbname,fileidx)
if append:
mode ='r+b'
f = universalOpen(self._filename)
(recordCount,) = struct.unpack('> I',f.read(4))
self._sequenceCount=recordCount
self._sequenceFileCount=recordCount
del f
self._file = open(self._filename,mode)
self._file.seek(0,0)
self._file.write(struct.pack('> I',0))
self.open('r+b')
self._file.seek(0,2)
else:
self._sequenceCount=0
mode = 'wb'
self._file = open(self._filename,mode)
self._file.write(struct.pack('> I',self._sequenceCount))
self._sequenceFileCount=0
self.open("wb")
if type is not None:
assert ftid is not None,"You must specify an id attribute for features"
@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object):
return packed
def close(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceFileCount))
self._file.close()
def open(self,mode):
self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
self._file=open(self._filename,mode)
self._sequenceFileCount=0
self._file.write(struct.pack('> I',self._sequenceFileCount))
def put(self,sequence):
if self._currentfile is None:
self._currentfile=self.currentInputFileName()
if self.currentInputFileName() != self._currentfile:
self._currentfile=self.currentInputFileName()
self.close()
self._fileidx+=1
self.open('wb')
if self._taxonomy is not None:
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
sequence.extractTaxon()
@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object):
if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount)
self._sequenceCount+=1
self._sequenceFileCount+=1
def __del__(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()
self.close()
+3 -3
View File
@@ -269,7 +269,7 @@ class Taxonomy(object):
def addPreferedName(self,taxid,name):
idx = self.findIndex(taxid)
self._preferedName.append(name,'obi',idx)
self._preferedName.append([name,'obi',idx])
self._preferedidx[idx]=name
return taxid
@@ -399,10 +399,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
for t in self._taxonomy[self._localtaxon:]:
self._name.append((t[3],'scientific name',i))
i+=1
try :
self._preferedName = [(x[0],'obi',x[2])
for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
for x in self.__ecoNameIterator(self._preferedNamesFile)]
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
except:
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
+4 -8
View File
@@ -33,10 +33,6 @@ from obitools.format.sequence import skipOnErrorIterator, skipfirst, only
from obitools import BioSequence
from obitools.utils import FakeFile
from glob import glob
from test.test_compiler import Toto
def binarySequenceIterator(lineiterator):
f = FakeFile(lineiterator)
@@ -263,9 +259,9 @@ def autoEntriesIterator(options):
raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'
if reader==binarySequenceIterator:
input = binarySequenceIterator(lineiterator)
input = binarySequenceIterator(lineiterator) # @ReservedAssignment
else:
input = reader(chain([first],lineiterator))
input = reader(chain([first],lineiterator)) # @ReservedAssignment
return input
@@ -325,7 +321,7 @@ def autoEntriesIterator(options):
def sequenceWriterGenerator(options,output=sys.stdout):
class SequenceWriter:
def __init__(self,options,file=sys.stdout):
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
self._format=None
self._file=file
self._upper=options.uppercase
@@ -350,7 +346,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
sys.exit(0)
class BinaryWriter:
def __init__(self,options,file=sys.stdout):
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
self._file=file
self._file.write("#!Pickle\n")
def put(self,seq):
+1 -1
View File
@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti
else:
if entryIterator == EcoPCRDBSequenceIterator and options is not None:
if options.ecodb==f:
if hasattr(options,'ecodb') and options.ecodb==f:
iterator = entryIterator(f,options.taxonomy)
else:
iterator = entryIterator(f)
+2 -2
View File
@@ -1,5 +1,5 @@
major = 1
minor = 1
serial= '21'
minor = 2
serial= '3'
version = "%2d.%02d %s" % (major,minor,serial)