14 Commits

Author SHA1 Message Date
coissac 23717f5833 Add a first version of the obisubset tool and tag the version 1.2.0 2015-10-20 22:02:02 +02:00
coissac fdeaf5956c Change the way ecoPCRDB are written by obitools. If the obitools is
called with several sequence files as input when the ecoPCRDB is
requested as output format, the sequences are splitted in several sdx
files
2015-10-02 16:36:25 +02:00
coissac 343d9ec6df Switch to version 1.1.22 2015-08-03 23:16:37 +02:00
coissac 7a993ec841 Patch a wrong import in the file 2015-08-03 23:14:13 +02:00
celinemercier 6170fac081 fixed a problem with the progress bar in obiselect when there is more
than one category
2015-07-20 15:24:30 +02:00
coissac 764ee27c0d Switch to the version 1.1.21 2015-07-13 15:31:20 +02:00
coissac 18d87aa1b1 Add a small work around patch for the --skip and --only options in
illuminapairedend
2015-07-11 23:38:47 +02:00
coissac 1c91c27ee3 Tag the version 1.1.20 2015-07-03 14:57:24 +02:00
coissac 22d343b46a Patch two small print bug in the new ecotag 2015-07-03 14:55:44 +02:00
coissac 2af94b9da7 Add docuentation for the new options and an option to manage the ecotag
cache size
2015-07-03 10:39:59 +02:00
coissac aa064dda57 Add two general options for limiting the analysis of a file to only a
sub-part of the data:

   - --skip <N>
   - --only <N>
   
They respectively allow to first skip the N first sequences, and to
analysize only the <N> next sequences.
2015-07-03 09:10:49 +02:00
coissac f38ccae698 Add a --minimum-circle option to ecotag and a cache on the self
alignment scores of 1000000 of pairwise scores
2015-07-02 16:14:22 +02:00
Celine Mercier b99881817a Closes #9 : adds a parser in obiaddtaxids for the UNITE 'general FASTA
release' format
2015-06-12 15:08:58 +02:00
Celine Mercier df093b3fa9 updated the UNITE parser of obiaddtaxids 2015-06-11 17:47:00 +02:00
16 changed files with 492 additions and 134 deletions
+2 -3
View File
@@ -3,13 +3,12 @@ Sequence sampling and filtering
.. toctree::
:maxdepth: 2
scripts/obiextract
scripts/obigrep
scripts/obihead
scripts/obisample
scripts/obiselect
scripts/obisplit
scripts/obisubset
scripts/obitail
@@ -3,6 +3,24 @@ Options to specify input format
.. program:: obitools
Restrict the analysis to a sub-part of the input file
.....................................................
.. cmdoption:: --skip <N>
The N first sequence records of the file are discarded from the analysis and
not reported to the output file
.. cmdoption:: --only <N>
Only the N next sequence records of the file are analyzed. The following sequences
in the file are neither analyzed, neither reported to the output file.
This option can be used conjointly with the `--skip` option.
Sequence annotated format
.........................
+38 -24
View File
@@ -1,55 +1,70 @@
.. automodule:: ecotag
:py:mod:`ecotag` specific options
---------------------------------
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
<FILENAME> is the fasta file containing the reference sequences
.. cmdoption:: -m FLOAT, --minimum-identity=FLOAT
When the best match with the reference database present an identity
level below FLOAT, the taxonomic assignment for the sequence record
is not computed. The sequence record is nevertheless included in the
output file. FLOAT is included in a [0,1] interval.
.. cmdoption:: --minimum-circle=FLOAT
When sequence identity is less than FLOAT, the taxonomic
assignment for the sequence record is not indicated in ``ecotag``'s
output. FLOAT is included in a [0,1] interval.
(This option doesn't seem to work).
minimum identity considered for the assignment circle.
FLOAT is included in a [0,1] interval.
.. cmdoption:: -x RANK, --explain=RANK
.. cmdoption:: -u, --uniq
When this option is specified, the program first dereplicates the sequence
records to work on unique sequences only. This option greatly improves
When this option is specified, the program first dereplicates the sequence
records to work on unique sequences only. This option greatly improves
the program's speed, especially for highly redundant datasets.
.. cmdoption:: --sort=<KEY>
The output is sorted based on the values of the relevant attribute.
.. cmdoption:: -r, --reverse
The output is sorted in reverse order (should be used with the --sort option).
(Works even if the --sort option is not set, but could not find on what
(Works even if the --sort option is not set, but could not find on what
the output is sorted).
.. cmdoption:: -E FLOAT, --errors=FLOAT
FLOAT is the fraction of reference sequences that will
be ignored when looking for the most recent common ancestor. This
option is useful when a non-negligible proportion of reference sequences
is expected to be assigned to the wrong taxon, for example because of
FLOAT is the fraction of reference sequences that will
be ignored when looking for the most recent common ancestor. This
option is useful when a non-negligible proportion of reference sequences
is expected to be assigned to the wrong taxon, for example because of
taxonomic misidentification. FLOAT is included in a [0,1] interval.
.. cmdoption:: --cache-size=INTEGER
A cache for computed similarities is maintained by `ecotag`. the default
size for this cache is 1,000,000 of scores. This option allows to change
the cache size.
.. include:: ../optionsSet/taxonomyDB.txt
.. include:: ../optionsSet/inputformat.txt
.. include:: ../optionsSet/outputformat.txt
.. include:: ../optionsSet/defaultoptions.txt
:py:mod:`ecotag` added sequence attributes
------------------------------------------
.. hlist::
:columns: 3
- :doc:`best_identity <../attributes/best_identity>`
- :doc:`best_match <../attributes/best_match>`
- :doc:`family <../attributes/family>`
@@ -65,4 +80,3 @@
- :doc:`species_list <../attributes/species_list>`
- :doc:`species_name <../attributes/species_name>`
- :doc:`taxid <../attributes/taxid>`
+81
View File
@@ -0,0 +1,81 @@
.. automodule:: obisubset
:py:mod:`obisubset` specific options
------------------------------------
.. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>,
The option ``-s`` allows to specify the tag containing sample descriptions,
the default value is set to *merged_sample*.
*Example:*
.. code-block:: bash
> obiuniq -m sample seq1.fasta > seq2.fasta
> obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
After the dereplication of the sequences using the
in the new attribute ``merged_sample``.
.. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>,
Another tag to clean according to the sample subset
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
.. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>,
File containing the samples names (one sample id per line).
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -l ids.txt seq2.fasta > seq3.fasta
.. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>,
A regular expression pattern matching the sample ids to extract.
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -p "negative_.*" seq2.fasta > seq3.fasta
.. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>,
A sample id to extract
*Example:*
.. code-block:: bash
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
.. include:: ../optionsSet/inputformat.txt
.. include:: ../optionsSet/outputformat.txt
.. include:: ../optionsSet/defaultoptions.txt
:py:mod:`obisubset` modifies sequence attributes
------------------------------------------------
.. hlist::
:columns: 3
- :doc:`count <../attributes/count>`
- :doc:`merged_* <../attributes/merged_star>`
:py:mod:`obisubset` used sequence attribute
-------------------------------------------
- :doc:`count <../attributes/taxid>`
- :doc:`merged_* <../attributes/merged_star>`
+1 -1
View File
@@ -19,7 +19,7 @@ from os import path
PACKAGE = "OBITools"
VERSION = "1.1.18"
VERSION = "1.2.0"
AUTHOR = 'Eric Coissac'
EMAIL = 'eric@coissac.eu'
URL = 'metabarcoding.org/obitools'
+60 -4
View File
@@ -46,6 +46,8 @@ from obitools.options.taxonomyfilter import addTaxonomyDBOptions,loadTaxonomyDat
from obitools.options import getOptionManager
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from collections import OrderedDict
import sys
import math
import os.path
@@ -76,6 +78,13 @@ def addSearchOptions(optionManager):
default=0.0,
help="minimum identity to consider.")
optionManager.add_option('--minimum-circle',
action="store", dest="circle",
metavar="identity",
type="float",
default=1.0,
help="minimum identity considered for the assignment circle.")
# optionManager.add_option('-S','--normalized-smallest',
# action="store_false", dest="large",
# default=True,
@@ -139,6 +148,13 @@ def addSearchOptions(optionManager):
default=0.0,
help='Tolerated rate of wrong assignation')
optionManager.add_option('--cache-size',
action='store',dest='cache',
type='int',
metavar='<SIZE>',
default=1000000,
help='Cache size for the aligment score')
def count(data):
rep = {}
@@ -190,6 +206,31 @@ def myLenlcs(s1, s2, minid, normalized, reference):
return lcs, lali
def cachedLenLCS(s1,s2,minid,normalized,reference):
global __LCSCache__
global __INCache__
global __OUTCache__
global __CACHE_SIZE__
pair=frozenset((s1.id,s2.id))
if pair in __LCSCache__:
rep=__LCSCache__[pair]
del __LCSCache__[pair]
__INCache__+=1.0
else:
rep=lenlcs(s1,s2,minid,normalized,reference)
__OUTCache__+=1.0
__LCSCache__[pair]=rep
if len(__LCSCache__) > __CACHE_SIZE__:
__LCSCache__.popitem(0)
return rep
#def lcsIterator(entries,db,options):
#
# for seq in entries:
@@ -233,7 +274,7 @@ def lcsIteratorSelf(entries,db,options):
maxid = ([],0.0)
minid = options.minimum
for d in db:
lcs,lali = myLenlcs(seq,d,minid,normalized=True,reference=ALILEN)
lcs,lali = myLenlcs(seq,d,minid,normalized=True,reference=ALILEN) # @UnusedVariable
if lcs > maxid[1] and lcs > options.minimum:
maxid = ([d],lcs)
minid = maxid[1]
@@ -241,11 +282,13 @@ def lcsIteratorSelf(entries,db,options):
maxid[0].append(d)
if maxid[0]:
if maxid[1] > options.circle:
maxid=(maxid[0],options.circle)
results.extend([(s,maxid[1]) for s in maxid[0]])
for d in db:
for s in maxid[0]:
if d.id != s.id:
lcs,lali = lenlcs(s,d,maxid[1],normalized=True,reference=ALILEN)
lcs,lali = cachedLenLCS(s,d,maxid[1],normalized=True,reference=ALILEN) # @UnusedVariable
if lcs >= maxid[1]:
results.append((d,lcs))
@@ -253,9 +296,19 @@ def lcsIteratorSelf(entries,db,options):
if __name__=='__main__':
__LCSCache__=OrderedDict()
__INCache__=1.0
__OUTCache__=1.0
optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
(options, entries) = optionParser()
__CACHE_SIZE__=options.cache
if __CACHE_SIZE__ < 10:
__CACHE_SIZE__=10
taxonomy = loadTaxonomyDatabase(options)
writer = sequenceWriterGenerator(options)
@@ -278,7 +331,7 @@ if __name__=='__main__':
taxonlink = {}
rankid = taxonomy.findRankByName(options.explain)
for seq in db:
id = seq.id[0:46]
seq.id=id
@@ -299,6 +352,8 @@ if __name__=='__main__':
search = lcsIteratorSelf(entries,db,options)
print >>sys.stderr,'\nCache size : %d\n' % __CACHE_SIZE__
for seq,best,match in search:
try:
@@ -385,8 +440,9 @@ if __name__=='__main__':
else:
seq['species_name']=None
writer(seq)
print >>sys.stderr,'\n%5.3f%% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)
+5 -17
View File
@@ -48,13 +48,14 @@ from obitools.options import getOptionManager, allEntryIterator
from obitools.align import QSolexaReverseAssemble
from obitools.align import QSolexaRightReverseAssemble
from obitools.tools._solexapairend import buildConsensus
from obitools.format.options import addInputFormatOption,addOutputFormatOption,\
from obitools.format.options import addOutputFormatOption,\
sequenceWriterGenerator
from itertools import chain
import cPickle
import math
from obitools.fastq._fastq import fastqIterator
from obitools.fastq._fastq import fastqIterator # @UnresolvedImport
def addSolexaPairEndOptions(optionManager):
optionManager.add_option('-r','--reverse-reads',
@@ -90,14 +91,6 @@ def addSolexaPairEndOptions(optionManager):
const='illumina',
help="input file is in fastq nucleic format produced by old solexa sequencer")
# optionManager.add_option('--proba',
# action="store", dest="proba",
# metavar="<FILENAME>",
# type="str",
# default=None,
# help="null ditribution data file")
optionManager.add_option('--score-min',
action="store", dest="smin",
metavar="#.###",
@@ -105,13 +98,6 @@ def addSolexaPairEndOptions(optionManager):
default=None,
help="minimum score for keeping aligment")
# optionManager.add_option('--pvalue',
# action="store", dest="pvalue",
# metavar="#.###",
# type="float",
# default=None,
# help="maximum pvalue for keeping aligment")
def cutDirectReverse(entries):
@@ -222,6 +208,8 @@ if __name__ == '__main__':
#WARNING TO REMOVE : DIRTY PATCH !
options.proba = None
options.skip = None
options.only = None
options.sminL = None
+72 -59
View File
@@ -68,7 +68,7 @@ Otherwise,
'''
import sys
import re
from obitools.fasta import fastaIterator,formatFasta
from obitools.options import getOptionManager
@@ -110,8 +110,10 @@ def addObiaddtaxidsOptions(optionManager):
metavar="<FORMAT>",
type="string",
default='raw',
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
" Default : raw.")
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE_FULL', 'UNITE_GENERAL' or 'SILVA'."
"The UNITE_FULL format is the one used for the 'Full UNITE+INSD dataset', and the UNITE_GENERAL format is the "
"one used for the 'General FASTA release'."
" Default : raw.")
optionManager.add_option('-k','--key-name',
action="store", dest="tagname",
@@ -142,22 +144,52 @@ def numberInStr(s) :
return containsNumber
def UNITEIterator(f):
def UNITEIterator_FULL(f):
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
for entry in fastaEntryIterator(f) :
all = entry.split('\n')
header = all[0]
fields = header.split('|')
id = fields[0][1:]
fields = header.split('|')
seq_id = fields[0][1:]
seq = all[1]
s = NucSequence(id, seq)
s['ISDN_species_name'] = fields[1]
s['UNITE_species_name'] = fields[3]
path1 = fields[2].replace(';', ',')
path2 = fields[4].replace(';', ',')
s['ISDN_path'] = path1
s['UNITE_path'] = path2
s = NucSequence(seq_id, seq)
path = fields[1]
species_name_loc = path.index('s__')
species_name_loc+=3
s['species_name'] = path[species_name_loc:]
genus_name_loc = path.index('g__')
genus_name_loc+=3
s['genus_name'] = path[genus_name_loc:species_name_loc-4]
path = re.sub('[a-z]__', '', path)
s['path'] = path.replace(';', ',')
yield s
def UNITEIterator_GENERAL(f):
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
for entry in fastaEntryIterator(f) :
all = entry.split('\n')
header = all[0]
fields = header.split('|')
seq_id = fields[0][1:]
seq = all[1]
s = NucSequence(seq_id, seq)
s['species_name'] = seq_id.replace("_", " ")
path = fields[4]
path = re.sub('[a-z]__', '', path)
path = path.replace(';', ',')
s['path'] = path.replace(',,', ',')
yield s
@@ -306,8 +338,10 @@ if __name__=='__main__':
if options.db_type == 'raw' :
entryIterator = fastaIterator
entries = entryIterator(entries)
elif options.db_type == 'UNITE' :
entryIterator = UNITEIterator
elif options.db_type == 'UNITE_FULL' :
entryIterator = UNITEIterator_FULL
elif options.db_type == 'UNITE_GENERAL' :
entryIterator = UNITEIterator_GENERAL
entries = entryIterator(entries)
elif options.db_type == 'SILVA' :
entryIterator = SILVAIterator
@@ -342,6 +376,7 @@ if __name__=='__main__':
if options.genus_found is not None and len(species_name.split(' ')) >= 2 :
try:
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
s['genus_taxid'] = genusTaxid
print>>options.genus_found, formatFasta(s)
genusFound = True
except KeyError :
@@ -357,54 +392,32 @@ if __name__=='__main__':
print>>options.unidentified,formatFasta(s)
elif options.db_type == 'UNITE' :
restricting_ancestor = tax.findTaxonByName('Fungi')[0]
elif ((options.db_type =='UNITE_FULL') or (options.db_type =='UNITE_GENERAL')) :
restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
for s in entries :
try:
species_name = s["UNITE_species_name"]
try :
species_name = s['species_name']
taxid = getTaxid(tax, species_name, restricting_ancestor)
s['taxid']=taxid
s["species_name"] = species_name
s['taxid'] = taxid
s['rank'] = tax.getRank(taxid)
print formatFasta(s)
except KeyError:
try:
species_name = s["ISDN_species_name"]
print species_name
taxid = getTaxid(tax, species_name, restricting_ancestor)
s['taxid']=taxid
s["species_name"] = species_name
print formatFasta(s)
except KeyError:
genusFound = False
if options.genus_found is not None :
try:
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
s['genus_taxid'] = genusTaxid
print>>options.genus_found, formatFasta(s)
genusFound = True
if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" :
s["species_name"] = s["UNITE_species_name"]
chosen = 'unite'
elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" :
s["species_name"] = s["ISDN_species_name"]
chosen = 'isdn'
else :
if s["UNITE_path"] != "-" and s["UNITE_path"] != "" :
chosen = 'unite'
s["species_name"] = (s["UNITE_path"].split(', '))[-1]
elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
chosen = 'isdn'
s["species_name"] = (s["ISDN_path"].split(', '))[-1]
else :
print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
if chosen == 'unite' :
s['path'] = s["UNITE_path"]
else :
s['path'] = s["ISDN_path"]
if options.unidentified is not None :
print>>options.unidentified,formatFasta(s)
except KeyError:
pass
if options.unidentified is not None and not genusFound :
print>>options.unidentified,formatFasta(s)
+2 -2
View File
@@ -18,7 +18,7 @@
'''
from obitools.options import getOptionManager
from obitools.format.options import addInOutputOption
from obitools.format.options import addInputFormatOption
def addCountOptions(optionManager):
group=optionManager.add_option_group('Obicount specific options')
@@ -36,7 +36,7 @@ def addCountOptions(optionManager):
if __name__ == '__main__':
optionParser = getOptionManager([addCountOptions,addInOutputOption], progdoc=__doc__)
optionParser = getOptionManager([addCountOptions,addInputFormatOption], progdoc=__doc__)
(options, entries) = optionParser()
+1 -1
View File
@@ -179,7 +179,7 @@ if __name__ == '__main__':
i=0
for c in classes:
i+=1
progressBar(i,lclasses,False,"%15s" % c)
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
seqs = classes[c]
sortclass(seqs, options)
if len(c)==1:
+116
View File
@@ -0,0 +1,116 @@
#!/usr/local/bin/python
'''
:py:mod:`obisubset`: extract a subset of samples
================================================
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
after its dereplication using :py:mod:`obiuniq` program.
'''
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
from obitools.options import getOptionManager
import re
def addSubsetOptions(optionManager):
group = optionManager.add_option_group('obisubset specific options')
group.add_option('-s','--sample',
action="store", dest="sample",
metavar="<TAGNAME>",
type="str",
default='merged_sample',
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
group.add_option('-o','--other-tag',
action="append", dest="taglist",
metavar="<TAGNAME>",
type="string",
default=[],
help="Another tag to clean according to the sample subset")
group.add_option('-l','--sample-list',
action="store", dest="samplelist",
metavar="<FILENAME>",
type="string",
default=None,
help="File containing the samples names (one sample id per line)")
group.add_option('-p','--sample-pattern',
action="store", dest="samplepattern",
metavar="<REGEX>",
type="string",
default=None,
help="A regular expression pattern matching the sample ids to extract")
group.add_option('-n','--sample-name',
action="append", dest="samplename",
metavar="<SAMPLEIDS>",
type="string",
default=[],
help="A sample id to extract")
def sequenceSelectorGenerator(options):
samplename = set(options.samplename)
othertags = set(options.taglist)
if options.samplelist is not None:
with open(options.samplelist) as lname :
for name in lname:
name = name.strip()
samplename.add(name)
if options.samplepattern is not None:
samplepattern = re.compile(options.samplepattern)
else:
samplepattern = None
def sequenceSelector(entries):
for entry in entries:
samples=entry[options.sample]
slist = set(samples.keys())
tokeep=slist & samplename
if samplepattern is not None:
for name in slist:
if samplepattern.match(name):
tokeep.add(name)
if tokeep:
newsample={}
newcount=0
for name in tokeep:
c = samples[name]
newsample[name]= c
newcount+=c
entry['count']=newcount
entry[options.sample]=newsample
for t in othertags:
if t in entry:
d = entry[t]
newd={}
for name in tokeep:
if name in d:
newd[name] = d[name]
entry[t]=newd
yield entry
return sequenceSelector
if __name__=='__main__':
optionParser = getOptionManager([addInOutputOption,addSubsetOptions],progdoc=__doc__)
(options, entries) = optionParser()
writer = sequenceWriterGenerator(options)
good = sequenceSelectorGenerator(options)
for seq in good(entries):
writer(seq)
+36 -12
View File
@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
class EcoPCRDBSequenceWriter(object):
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
from obitools.options import currentInputFileName
self.currentInputFileName=currentInputFileName
# Take care of the taxonomy associated to the database
self._currentfile=None
self._taxonomy= loadTaxonomyDatabase(options)
dbname = options.ecopcroutput
if (self._taxonomy is not None
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
print >> sys.stderr,"Writing the taxonomy file...",
@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object):
fileidx = max(list(int(p.search(i).group(1))
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
) +1
self._fileidx=fileidx
self._dbname=dbname
self._filename="%s_%03d.sdx" % (dbname,fileidx)
if append:
mode ='r+b'
f = universalOpen(self._filename)
(recordCount,) = struct.unpack('> I',f.read(4))
self._sequenceCount=recordCount
self._sequenceFileCount=recordCount
del f
self._file = open(self._filename,mode)
self._file.seek(0,0)
self._file.write(struct.pack('> I',0))
self.open('r+b')
self._file.seek(0,2)
else:
self._sequenceCount=0
mode = 'wb'
self._file = open(self._filename,mode)
self._file.write(struct.pack('> I',self._sequenceCount))
self._sequenceFileCount=0
self.open("wb")
if type is not None:
assert ftid is not None,"You must specify an id attribute for features"
@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object):
return packed
def close(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceFileCount))
self._file.close()
def open(self,mode):
self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
self._file=open(self._filename,mode)
self._sequenceFileCount=0
self._file.write(struct.pack('> I',self._sequenceFileCount))
def put(self,sequence):
if self._currentfile is None:
self._currentfile=self.currentInputFileName()
if self.currentInputFileName() != self._currentfile:
self._currentfile=self.currentInputFileName()
self.close()
self._fileidx+=1
self.open('wb')
if self._taxonomy is not None:
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
sequence.extractTaxon()
@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object):
if self._annotation is not None:
self._annotation.put(sequence, self._sequenceCount)
self._sequenceCount+=1
self._sequenceFileCount+=1
def __del__(self):
self._file.seek(0,0)
self._file.write(struct.pack('> I',self._sequenceCount))
self._file.close()
self.close()
+30 -8
View File
@@ -29,13 +29,10 @@ import sys
import re
from obitools.ecopcr import EcoPCRFile
from obitools.format.sequence import skipOnErrorIterator
from obitools.format.sequence import skipOnErrorIterator, skipfirst, only
from obitools import BioSequence
from obitools.utils import FakeFile
from glob import glob
def binarySequenceIterator(lineiterator):
f = FakeFile(lineiterator)
@@ -52,6 +49,23 @@ def binarySequenceIterator(lineiterator):
def addInputFormatOption(optionManager):
group = optionManager.add_option_group("Restriction to a sub-part options",
"Allow to limit analysis to a sub-part of the data file")
group.add_option('--skip',
action="store", dest="skip",
metavar='<N>',
default=None,
type='int',
help="skip the N first sequences")
group.add_option('--only',
action="store", dest="only",
metavar='<N>',
default=None,
type='int',
help="treat only N sequences")
group = optionManager.add_option_group("Input format options",
"If not specified, a test is done to determine the file format")
@@ -245,9 +259,9 @@ def autoEntriesIterator(options):
raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'
if reader==binarySequenceIterator:
input = binarySequenceIterator(lineiterator)
input = binarySequenceIterator(lineiterator) # @ReservedAssignment
else:
input = reader(chain([first],lineiterator))
input = reader(chain([first],lineiterator)) # @ReservedAssignment
return input
@@ -294,12 +308,20 @@ def autoEntriesIterator(options):
if options.skiperror:
reader = skipOnErrorIterator(reader)
if hasattr(options, 'skip') and options.skip is not None:
print >>sys.stderr,"Skipping %d sequences" % options.skip
reader = skipfirst(reader,options.skip)
if hasattr(options, 'only') and options.only is not None:
print >>sys.stderr,"Analysing only %d sequences" % options.only
reader = only(reader,options.only)
return reader
def sequenceWriterGenerator(options,output=sys.stdout):
class SequenceWriter:
def __init__(self,options,file=sys.stdout):
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
self._format=None
self._file=file
self._upper=options.uppercase
@@ -324,7 +346,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
sys.exit(0)
class BinaryWriter:
def __init__(self,options,file=sys.stdout):
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
self._file=file
self._file.write("#!Pickle\n")
def put(self,seq):
+27
View File
@@ -21,6 +21,33 @@ def skipOnErrorIterator(seqIterator):
continue
return internal
def skipfirst(seqIterator,n):
def internal(inputdata):
si = seqIterator(inputdata)
c=0
for seq in si:
c+=1
if c > n:
yield seq
print >>sys.stderr
return internal
def only(seqIterator,n):
def internal(inputdata):
si = seqIterator(inputdata)
c=0
for seq in si:
if c < n:
yield seq
else:
break
c+=1
print >>sys.stderr
return internal
def autoSequenceIterator(file):
+1 -1
View File
@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti
else:
if entryIterator == EcoPCRDBSequenceIterator and options is not None:
if options.ecodb==f:
if hasattr(options,'ecodb') and options.ecodb==f:
iterator = entryIterator(f,options.taxonomy)
else:
iterator = entryIterator(f)
+2 -2
View File
@@ -1,5 +1,5 @@
major = 1
minor = 1
serial= '18'
minor = 2
serial= '0'
version = "%2d.%02d %s" % (major,minor,serial)