Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ffb33f61a9 | |||
| adbeac9684 | |||
| 4aac47e639 | |||
| 2028c69b3f | |||
| 2649cd268c | |||
| 46b446f0c8 | |||
| 6208cbee02 | |||
| 1d4c3f99b2 | |||
| ba30e74dc9 | |||
| 0e8b97ca40 | |||
| a95ae5126b | |||
| 0e2f859839 | |||
| 23717f5833 | |||
| fdeaf5956c | |||
| 343d9ec6df | |||
| 7a993ec841 | |||
| 6170fac081 |
@@ -3,13 +3,12 @@ Sequence sampling and filtering
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
|
||||
scripts/obiextract
|
||||
scripts/obigrep
|
||||
scripts/obihead
|
||||
scripts/obisample
|
||||
scripts/obiselect
|
||||
scripts/obisplit
|
||||
scripts/obisubset
|
||||
scripts/obitail
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
output file. FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: --minimum-circle=FLOAT
|
||||
|
||||
|
||||
minimum identity considered for the assignment circle.
|
||||
FLOAT is included in a [0,1] interval.
|
||||
|
||||
@@ -40,11 +40,20 @@
|
||||
.. cmdoption:: -E FLOAT, --errors=FLOAT
|
||||
|
||||
FLOAT is the fraction of reference sequences that will
|
||||
be ignored when looking for the most recent common ancestor. This
|
||||
be ignored when looking for the lowest common ancestor. This
|
||||
option is useful when a non-negligible proportion of reference sequences
|
||||
is expected to be assigned to the wrong taxon, for example because of
|
||||
taxonomic misidentification. FLOAT is included in a [0,1] interval.
|
||||
|
||||
|
||||
.. cmdoption:: -M INTEGER, --min-matches=FLOAT
|
||||
|
||||
Define the minimum congruent assignation. If this minimum is reached and
|
||||
the -E option is activated, the lowest common ancestor algorithm tolarated
|
||||
that some sequences do not provide the same taxonomic annotation (see the
|
||||
-E option).
|
||||
|
||||
|
||||
.. cmdoption:: --cache-size=INTEGER
|
||||
|
||||
A cache for computed similarities is maintained by `ecotag`. the default
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
.. automodule:: obisubset
|
||||
|
||||
|
||||
:py:mod:`obisubset` specific options
|
||||
------------------------------------
|
||||
|
||||
.. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>,
|
||||
|
||||
The option ``-s`` allows to specify the tag containing sample descriptions,
|
||||
the default value is set to *merged_sample*.
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obiuniq -m sample seq1.fasta > seq2.fasta
|
||||
> obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
After the dereplication of the sequences using the
|
||||
in the new attribute ``merged_sample``.
|
||||
|
||||
.. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>,
|
||||
|
||||
Another tag to clean according to the sample subset
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>,
|
||||
|
||||
File containing the samples names (one sample id per line).
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -l ids.txt seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>,
|
||||
|
||||
A regular expression pattern matching the sample ids to extract.
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -p "negative_.*" seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>,
|
||||
|
||||
A sample id to extract
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
.. include:: ../optionsSet/inputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/outputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/defaultoptions.txt
|
||||
|
||||
:py:mod:`obisubset` modifies sequence attributes
|
||||
------------------------------------------------
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
- :doc:`count <../attributes/count>`
|
||||
- :doc:`merged_* <../attributes/merged_star>`
|
||||
|
||||
:py:mod:`obisubset` used sequence attribute
|
||||
-------------------------------------------
|
||||
|
||||
- :doc:`count <../attributes/taxid>`
|
||||
- :doc:`merged_* <../attributes/merged_star>`
|
||||
@@ -19,7 +19,7 @@ from os import path
|
||||
|
||||
|
||||
PACKAGE = "OBITools"
|
||||
VERSION = "1.1.21"
|
||||
VERSION = "1.2.3"
|
||||
AUTHOR = 'Eric Coissac'
|
||||
EMAIL = 'eric@coissac.eu'
|
||||
URL = 'metabarcoding.org/obitools'
|
||||
|
||||
+18
-6
@@ -1,4 +1,4 @@
|
||||
#!/usr/local/bin/python
|
||||
#!/usr/local/OBITools-1.1.22/bin/python
|
||||
'''
|
||||
:py:mod:`ecotag`: assigns sequences to taxa
|
||||
===========================================
|
||||
@@ -145,9 +145,16 @@ def addSearchOptions(optionManager):
|
||||
|
||||
optionManager.add_option('-E','--errors',
|
||||
action='store',dest='error',
|
||||
type='float',
|
||||
default=0.0,
|
||||
help='Tolerated rate of wrong assignation')
|
||||
|
||||
optionManager.add_option('-M','--min-matches',
|
||||
action='store',dest='minmatches',
|
||||
type="int",
|
||||
default=1,
|
||||
help='Minimum congruent assignation')
|
||||
|
||||
optionManager.add_option('--cache-size',
|
||||
action='store',dest='cache',
|
||||
type='int',
|
||||
@@ -333,10 +340,10 @@ if __name__=='__main__':
|
||||
rankid = taxonomy.findRankByName(options.explain)
|
||||
|
||||
for seq in db:
|
||||
id = seq.id[0:46]
|
||||
seq.id=id
|
||||
assert id not in taxonlink
|
||||
taxonlink[id]=int(seq['taxid'])
|
||||
seqid = seq.id[0:46]
|
||||
seq.id=seqid
|
||||
assert seqid not in taxonlink
|
||||
taxonlink[seqid]=int(seq['taxid'])
|
||||
|
||||
|
||||
if options.uniq:
|
||||
@@ -363,7 +370,12 @@ if __name__=='__main__':
|
||||
|
||||
if best[0]:
|
||||
taxlist = set(taxonlink[p[0].id] for p in match)
|
||||
lca = taxonomy.betterCommonTaxon(options.error,*tuple(taxlist))
|
||||
if options.error > 0.0 and len(match) >= int(options.minmatches / (1.0 - options.error)):
|
||||
lca = taxonomy.betterCommonTaxon(options.error,
|
||||
*tuple(taxlist))
|
||||
else:
|
||||
lca = taxonomy.betterCommonTaxon(0.0,*tuple(taxlist))
|
||||
|
||||
scname = taxonomy.getScientificName(lca)
|
||||
rank = taxonomy.getRank(lca)
|
||||
if len(taxlist) < 15:
|
||||
|
||||
+2
-1
@@ -48,7 +48,7 @@ Otherwise,
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obiaddtaxids -T species_name -g genus_identified.fasta \\
|
||||
> obiaddtaxids -k species_name -g genus_identified.fasta \\
|
||||
-u unidentified.fasta -d my_ecopcr_database \\
|
||||
my_sequences.fasta > identified.fasta
|
||||
|
||||
@@ -340,6 +340,7 @@ if __name__=='__main__':
|
||||
entries = entryIterator(entries)
|
||||
elif options.db_type == 'UNITE_FULL' :
|
||||
entryIterator = UNITEIterator_FULL
|
||||
entries = entryIterator(entries)
|
||||
elif options.db_type == 'UNITE_GENERAL' :
|
||||
entryIterator = UNITEIterator_GENERAL
|
||||
entries = entryIterator(entries)
|
||||
|
||||
+97
-74
@@ -23,8 +23,11 @@ def minimum(seqs):
|
||||
return min(s['select'] for s in seqs)
|
||||
|
||||
def maximum(seqs):
|
||||
return max(s['select'] for s in seqs)
|
||||
|
||||
try:
|
||||
return max(s['select'] for s in seqs)
|
||||
except TypeError, e:
|
||||
print >>sys.stderr, seqs
|
||||
raise e
|
||||
def mean(seqs):
|
||||
ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0)
|
||||
return float(ss) / len(seqs)
|
||||
@@ -101,6 +104,13 @@ def addSelectOptions(optionManager):
|
||||
default=[],
|
||||
help="attributes to merge within each group")
|
||||
|
||||
group.add_option('-s','--sample',
|
||||
action="store", dest="sample",
|
||||
metavar="<TAGNAME>",
|
||||
type="str",
|
||||
default=None,
|
||||
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
|
||||
|
||||
group.add_option('--merge-ids',
|
||||
action="store_true", dest="mergeids",
|
||||
default=False,
|
||||
@@ -129,16 +139,22 @@ if __name__ == '__main__':
|
||||
classes = {}
|
||||
|
||||
print >>sys.stderr,"\nLoading sequences...\n"
|
||||
|
||||
with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None
|
||||
|
||||
nbseq=0
|
||||
|
||||
for s in entries:
|
||||
nbseq+=1
|
||||
category = []
|
||||
|
||||
if with_taxonomy:
|
||||
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
|
||||
else:
|
||||
environ = {'sequence':s,'random':random()}
|
||||
|
||||
for c in options.categories:
|
||||
try:
|
||||
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
|
||||
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
|
||||
else:
|
||||
environ = {'sequence':s,'random':random()}
|
||||
|
||||
v = eval(c,environ,s)
|
||||
category.append(v)
|
||||
except:
|
||||
@@ -149,11 +165,6 @@ if __name__ == '__main__':
|
||||
group.append(s)
|
||||
classes[category]= group
|
||||
|
||||
if hasattr(options, 'taxonomy') and options.taxonomy is not None:
|
||||
environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
|
||||
else:
|
||||
environ = {'sequence':s, 'random':random()}
|
||||
|
||||
try:
|
||||
select = eval(options.function,environ,s)
|
||||
s['select']=select
|
||||
@@ -179,77 +190,89 @@ if __name__ == '__main__':
|
||||
i=0
|
||||
for c in classes:
|
||||
i+=1
|
||||
progressBar(i,lclasses,False,"%15s" % c)
|
||||
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
|
||||
seqs = classes[c]
|
||||
sortclass(seqs, options)
|
||||
if len(c)==1:
|
||||
c=c[0]
|
||||
if options.sample is not None:
|
||||
subsets = {}
|
||||
for s in seqs:
|
||||
for sid in s[options.sample]:
|
||||
ss = subsets.get(sid,[])
|
||||
ss.append(s)
|
||||
subsets[sid]=ss
|
||||
else:
|
||||
subsets={"all":seqs}
|
||||
|
||||
if options.number==1:
|
||||
s = seqs[0]
|
||||
for seqs in subsets.values():
|
||||
sortclass(seqs, options)
|
||||
|
||||
for key in mergedKey:
|
||||
if key=='taxid' and mergeIds:
|
||||
if 'taxid_dist' not in s:
|
||||
s["taxid_dist"]={}
|
||||
if 'taxid' in s:
|
||||
s["taxid_dist"][s.id]=s['taxid']
|
||||
mkey = "merged_%s" % key
|
||||
if mkey not in s:
|
||||
if key in s:
|
||||
s[mkey]={s[key]:1}
|
||||
else:
|
||||
s[mkey]={}
|
||||
|
||||
if 'count' not in s:
|
||||
s['count']=1
|
||||
if mergeIds:
|
||||
s['merged']=[s.id]
|
||||
|
||||
for seq in seqs[1:]:
|
||||
if len(c)==1:
|
||||
c=c[0]
|
||||
|
||||
s['count']+=seq['count']
|
||||
if options.number==1 and options.sample is None:
|
||||
s = seqs[0]
|
||||
|
||||
for key in mergedKey:
|
||||
if key=='taxid' and mergeIds:
|
||||
if 'taxid_dist' in seq:
|
||||
s["taxid_dist"].update(seq["taxid_dist"])
|
||||
if 'taxid' in seq:
|
||||
s["taxid_dist"][seq.id]=seq['taxid']
|
||||
|
||||
if 'taxid_dist' not in s:
|
||||
s["taxid_dist"]={}
|
||||
if 'taxid' in s:
|
||||
s["taxid_dist"][s.id]=s['taxid']
|
||||
mkey = "merged_%s" % key
|
||||
if mkey in seq:
|
||||
m = seq[mkey]
|
||||
else:
|
||||
if key in seq:
|
||||
m={seq[key]:1}
|
||||
|
||||
allmkey = set(m.keys()) | set(s[mkey].keys())
|
||||
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
|
||||
|
||||
# if mkey in seq:
|
||||
# for skey in seq[mkey]:
|
||||
# if skey in s:
|
||||
# s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey]
|
||||
# else:
|
||||
# s[mkey][skey]=seq[mkey][skey]
|
||||
|
||||
#for key in seq.iterkeys():
|
||||
# # Merger proprement l'attribut merged s'il exist
|
||||
# if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged' and key!='select':
|
||||
# del(s[key])
|
||||
|
||||
|
||||
if mkey not in s:
|
||||
if key in s:
|
||||
s[mkey]={s[key]:1}
|
||||
else:
|
||||
s[mkey]={}
|
||||
|
||||
if 'count' not in s:
|
||||
s['count']=1
|
||||
if mergeIds:
|
||||
s['merged'].append(seq.id)
|
||||
|
||||
if taxonomy is not None:
|
||||
mergeTaxonomyClassification(seqs, taxonomy)
|
||||
s['merged']=[s.id]
|
||||
|
||||
for seq in seqs[1:]:
|
||||
|
||||
|
||||
for s in seqs[0:options.number]:
|
||||
s['class']=c
|
||||
del s['select']
|
||||
writer(s)
|
||||
s['count']+=seq['count']
|
||||
|
||||
for key in mergedKey:
|
||||
if key=='taxid' and mergeIds:
|
||||
if 'taxid_dist' in seq:
|
||||
s["taxid_dist"].update(seq["taxid_dist"])
|
||||
if 'taxid' in seq:
|
||||
s["taxid_dist"][seq.id]=seq['taxid']
|
||||
|
||||
mkey = "merged_%s" % key
|
||||
if mkey in seq:
|
||||
m = seq[mkey]
|
||||
else:
|
||||
if key in seq:
|
||||
m={seq[key]:1}
|
||||
|
||||
allmkey = set(m.keys()) | set(s[mkey].keys())
|
||||
s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
|
||||
|
||||
if mergeIds:
|
||||
s['merged'].append(seq.id)
|
||||
|
||||
if taxonomy is not None:
|
||||
mergeTaxonomyClassification(seqs, taxonomy)
|
||||
|
||||
|
||||
for s in seqs[0:options.number]:
|
||||
s['class']=c
|
||||
s['__@TOWRITE@__']=True
|
||||
|
||||
print >>sys.stderr,"\Writing sequences...\n"
|
||||
progressBar(1,nbseq,True,'Writing')
|
||||
|
||||
i=0
|
||||
for c in classes:
|
||||
seqs = classes[c]
|
||||
for s in seqs:
|
||||
i+=1
|
||||
progressBar(i,nbseq,False,"Writing")
|
||||
if '__@TOWRITE@__' in s:
|
||||
del s['__@TOWRITE@__']
|
||||
del s['select']
|
||||
writer(s)
|
||||
|
||||
print >>sys.stderr
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/local/bin/python
|
||||
'''
|
||||
:py:mod:`obisubset`: extract a subset of samples
|
||||
================================================
|
||||
|
||||
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
|
||||
|
||||
The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
|
||||
after its dereplication using :py:mod:`obiuniq` program.
|
||||
'''
|
||||
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
|
||||
from obitools.options import getOptionManager
|
||||
import re
|
||||
|
||||
def addSubsetOptions(optionManager):
|
||||
|
||||
group = optionManager.add_option_group('obisubset specific options')
|
||||
group.add_option('-s','--sample',
|
||||
action="store", dest="sample",
|
||||
metavar="<TAGNAME>",
|
||||
type="str",
|
||||
default='merged_sample',
|
||||
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
|
||||
|
||||
group.add_option('-o','--other-tag',
|
||||
action="append", dest="taglist",
|
||||
metavar="<TAGNAME>",
|
||||
type="string",
|
||||
default=[],
|
||||
help="Another tag to clean according to the sample subset")
|
||||
|
||||
group.add_option('-l','--sample-list',
|
||||
action="store", dest="samplelist",
|
||||
metavar="<FILENAME>",
|
||||
type="string",
|
||||
default=None,
|
||||
help="File containing the samples names (one sample id per line)")
|
||||
|
||||
group.add_option('-p','--sample-pattern',
|
||||
action="store", dest="samplepattern",
|
||||
metavar="<REGEX>",
|
||||
type="string",
|
||||
default=None,
|
||||
help="A regular expression pattern matching the sample ids to extract")
|
||||
|
||||
group.add_option('-n','--sample-name',
|
||||
action="append", dest="samplename",
|
||||
metavar="<SAMPLEIDS>",
|
||||
type="string",
|
||||
default=[],
|
||||
help="A sample id to extract")
|
||||
|
||||
def sequenceSelectorGenerator(options):
|
||||
|
||||
samplename = set(options.samplename)
|
||||
othertags = set(options.taglist)
|
||||
|
||||
if options.samplelist is not None:
|
||||
with open(options.samplelist) as lname :
|
||||
for name in lname:
|
||||
name = name.strip()
|
||||
samplename.add(name)
|
||||
|
||||
if options.samplepattern is not None:
|
||||
samplepattern = re.compile(options.samplepattern)
|
||||
else:
|
||||
samplepattern = None
|
||||
|
||||
def sequenceSelector(entries):
|
||||
for entry in entries:
|
||||
samples=entry[options.sample]
|
||||
|
||||
slist = set(samples.keys())
|
||||
tokeep=slist & samplename
|
||||
|
||||
if samplepattern is not None:
|
||||
for name in slist:
|
||||
if samplepattern.match(name):
|
||||
tokeep.add(name)
|
||||
|
||||
if tokeep:
|
||||
newsample={}
|
||||
newcount=0
|
||||
for name in tokeep:
|
||||
c = samples[name]
|
||||
newsample[name]= c
|
||||
newcount+=c
|
||||
|
||||
entry['count']=newcount
|
||||
entry[options.sample]=newsample
|
||||
|
||||
for t in othertags:
|
||||
if t in entry:
|
||||
d = entry[t]
|
||||
newd={}
|
||||
for name in tokeep:
|
||||
if name in d:
|
||||
newd[name] = d[name]
|
||||
entry[t]=newd
|
||||
|
||||
yield entry
|
||||
|
||||
return sequenceSelector
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
optionParser = getOptionManager([addInOutputOption,addSubsetOptions],progdoc=__doc__)
|
||||
|
||||
(options, entries) = optionParser()
|
||||
|
||||
writer = sequenceWriterGenerator(options)
|
||||
|
||||
good = sequenceSelectorGenerator(options)
|
||||
|
||||
for seq in good(entries):
|
||||
writer(seq)
|
||||
+3
-3
@@ -320,9 +320,9 @@ if __name__ == '__main__':
|
||||
parent[3],parent[0],options.taxonomy._ranks[parent[1]])
|
||||
localdata=True
|
||||
|
||||
for n in options.newname:
|
||||
tx = t.split(t,':')
|
||||
taxid = options.taxonomy.addPreferedName(tx[0].strip(),tx[1])
|
||||
for t in options.newname:
|
||||
tx = t.split(':')
|
||||
taxid = options.taxonomy.addPreferedName(int(tx[1]), tx[0].strip())
|
||||
print "name : %8d\t->\t%s" % (taxid,options.taxonomy.getPreferedName(taxid))
|
||||
|
||||
ecoTaxonomyWriter(options.ecodb,options.taxonomy,onlyLocal=True)
|
||||
|
||||
@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||
class EcoPCRDBSequenceWriter(object):
|
||||
|
||||
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
|
||||
|
||||
from obitools.options import currentInputFileName
|
||||
self.currentInputFileName=currentInputFileName
|
||||
# Take care of the taxonomy associated to the database
|
||||
self._currentfile=None
|
||||
|
||||
self._taxonomy= loadTaxonomyDatabase(options)
|
||||
dbname = options.ecopcroutput
|
||||
|
||||
|
||||
if (self._taxonomy is not None
|
||||
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
|
||||
print >> sys.stderr,"Writing the taxonomy file...",
|
||||
@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object):
|
||||
fileidx = max(list(int(p.search(i).group(1))
|
||||
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
|
||||
) +1
|
||||
|
||||
self._fileidx=fileidx
|
||||
self._dbname=dbname
|
||||
|
||||
|
||||
self._filename="%s_%03d.sdx" % (dbname,fileidx)
|
||||
if append:
|
||||
mode ='r+b'
|
||||
f = universalOpen(self._filename)
|
||||
(recordCount,) = struct.unpack('> I',f.read(4))
|
||||
self._sequenceCount=recordCount
|
||||
self._sequenceFileCount=recordCount
|
||||
del f
|
||||
self._file = open(self._filename,mode)
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',0))
|
||||
self.open('r+b')
|
||||
self._file.seek(0,2)
|
||||
|
||||
else:
|
||||
self._sequenceCount=0
|
||||
mode = 'wb'
|
||||
self._file = open(self._filename,mode)
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
self._sequenceFileCount=0
|
||||
self.open("wb")
|
||||
|
||||
if type is not None:
|
||||
assert ftid is not None,"You must specify an id attribute for features"
|
||||
@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object):
|
||||
return packed
|
||||
|
||||
|
||||
def close(self):
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',self._sequenceFileCount))
|
||||
self._file.close()
|
||||
|
||||
def open(self,mode):
|
||||
self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
|
||||
self._file=open(self._filename,mode)
|
||||
self._sequenceFileCount=0
|
||||
self._file.write(struct.pack('> I',self._sequenceFileCount))
|
||||
|
||||
def put(self,sequence):
|
||||
if self._currentfile is None:
|
||||
self._currentfile=self.currentInputFileName()
|
||||
if self.currentInputFileName() != self._currentfile:
|
||||
self._currentfile=self.currentInputFileName()
|
||||
|
||||
self.close()
|
||||
|
||||
self._fileidx+=1
|
||||
self.open('wb')
|
||||
|
||||
if self._taxonomy is not None:
|
||||
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
|
||||
sequence.extractTaxon()
|
||||
@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object):
|
||||
if self._annotation is not None:
|
||||
self._annotation.put(sequence, self._sequenceCount)
|
||||
self._sequenceCount+=1
|
||||
self._sequenceFileCount+=1
|
||||
|
||||
def __del__(self):
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
self._file.close()
|
||||
self.close()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -269,7 +269,7 @@ class Taxonomy(object):
|
||||
|
||||
def addPreferedName(self,taxid,name):
|
||||
idx = self.findIndex(taxid)
|
||||
self._preferedName.append(name,'obi',idx)
|
||||
self._preferedName.append([name,'obi',idx])
|
||||
self._preferedidx[idx]=name
|
||||
return taxid
|
||||
|
||||
@@ -399,10 +399,10 @@ class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
|
||||
for t in self._taxonomy[self._localtaxon:]:
|
||||
self._name.append((t[3],'scientific name',i))
|
||||
i+=1
|
||||
|
||||
|
||||
try :
|
||||
self._preferedName = [(x[0],'obi',x[2])
|
||||
for x in self.__ecoNameIterator(self._preferedNamesFile,noError=True)]
|
||||
for x in self.__ecoNameIterator(self._preferedNamesFile)]
|
||||
print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
|
||||
except:
|
||||
print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
|
||||
|
||||
@@ -33,10 +33,6 @@ from obitools.format.sequence import skipOnErrorIterator, skipfirst, only
|
||||
from obitools import BioSequence
|
||||
from obitools.utils import FakeFile
|
||||
|
||||
from glob import glob
|
||||
from test.test_compiler import Toto
|
||||
|
||||
|
||||
def binarySequenceIterator(lineiterator):
|
||||
|
||||
f = FakeFile(lineiterator)
|
||||
@@ -263,9 +259,9 @@ def autoEntriesIterator(options):
|
||||
raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'
|
||||
|
||||
if reader==binarySequenceIterator:
|
||||
input = binarySequenceIterator(lineiterator)
|
||||
input = binarySequenceIterator(lineiterator) # @ReservedAssignment
|
||||
else:
|
||||
input = reader(chain([first],lineiterator))
|
||||
input = reader(chain([first],lineiterator)) # @ReservedAssignment
|
||||
|
||||
return input
|
||||
|
||||
@@ -325,7 +321,7 @@ def autoEntriesIterator(options):
|
||||
|
||||
def sequenceWriterGenerator(options,output=sys.stdout):
|
||||
class SequenceWriter:
|
||||
def __init__(self,options,file=sys.stdout):
|
||||
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
|
||||
self._format=None
|
||||
self._file=file
|
||||
self._upper=options.uppercase
|
||||
@@ -350,7 +346,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
|
||||
sys.exit(0)
|
||||
|
||||
class BinaryWriter:
|
||||
def __init__(self,options,file=sys.stdout):
|
||||
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
|
||||
self._file=file
|
||||
self._file.write("#!Pickle\n")
|
||||
def put(self,seq):
|
||||
|
||||
@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti
|
||||
else:
|
||||
|
||||
if entryIterator == EcoPCRDBSequenceIterator and options is not None:
|
||||
if options.ecodb==f:
|
||||
if hasattr(options,'ecodb') and options.ecodb==f:
|
||||
iterator = entryIterator(f,options.taxonomy)
|
||||
else:
|
||||
iterator = entryIterator(f)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
major = 1
|
||||
minor = 1
|
||||
serial= '21'
|
||||
minor = 2
|
||||
serial= '3'
|
||||
|
||||
version = "%2d.%02d %s" % (major,minor,serial)
|
||||
|
||||
Reference in New Issue
Block a user