Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 23717f5833 | |||
| fdeaf5956c | |||
| 343d9ec6df | |||
| 7a993ec841 | |||
| 6170fac081 | |||
| 764ee27c0d | |||
| 18d87aa1b1 | |||
| 1c91c27ee3 | |||
| 22d343b46a | |||
| 2af94b9da7 | |||
| aa064dda57 | |||
| f38ccae698 | |||
| b99881817a | |||
| df093b3fa9 |
@@ -3,13 +3,12 @@ Sequence sampling and filtering
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
|
||||
scripts/obiextract
|
||||
scripts/obigrep
|
||||
scripts/obihead
|
||||
scripts/obisample
|
||||
scripts/obiselect
|
||||
scripts/obisplit
|
||||
scripts/obisubset
|
||||
scripts/obitail
|
||||
|
||||
|
||||
@@ -3,6 +3,24 @@ Options to specify input format
|
||||
|
||||
.. program:: obitools
|
||||
|
||||
|
||||
Restrict the analysis to a sub-part of the input file
|
||||
.....................................................
|
||||
|
||||
.. cmdoption:: --skip <N>
|
||||
|
||||
The N first sequence records of the file are discarded from the analysis and
|
||||
not reported to the output file
|
||||
|
||||
|
||||
.. cmdoption:: --only <N>
|
||||
|
||||
Only the N next sequence records of the file are analyzed. The following sequences
|
||||
in the file are neither analyzed, neither reported to the output file.
|
||||
This option can be used conjointly with the `--skip` option.
|
||||
|
||||
|
||||
|
||||
Sequence annotated format
|
||||
.........................
|
||||
|
||||
|
||||
@@ -1,55 +1,70 @@
|
||||
.. automodule:: ecotag
|
||||
|
||||
|
||||
:py:mod:`ecotag` specific options
|
||||
---------------------------------
|
||||
|
||||
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
|
||||
|
||||
.. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
|
||||
|
||||
<FILENAME> is the fasta file containing the reference sequences
|
||||
|
||||
.. cmdoption:: -m FLOAT, --minimum-identity=FLOAT
|
||||
|
||||
When the best match with the reference database present an identity
|
||||
level below FLOAT, the taxonomic assignment for the sequence record
|
||||
is not computed. The sequence record is nevertheless included in the
|
||||
output file. FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: --minimum-circle=FLOAT
|
||||
|
||||
When sequence identity is less than FLOAT, the taxonomic
|
||||
assignment for the sequence record is not indicated in ``ecotag``'s
|
||||
output. FLOAT is included in a [0,1] interval.
|
||||
(This option doesn't seem to work).
|
||||
minimum identity considered for the assignment circle.
|
||||
FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: -x RANK, --explain=RANK
|
||||
|
||||
|
||||
.. cmdoption:: -u, --uniq
|
||||
|
||||
When this option is specified, the program first dereplicates the sequence
|
||||
records to work on unique sequences only. This option greatly improves
|
||||
|
||||
When this option is specified, the program first dereplicates the sequence
|
||||
records to work on unique sequences only. This option greatly improves
|
||||
the program's speed, especially for highly redundant datasets.
|
||||
|
||||
.. cmdoption:: --sort=<KEY>
|
||||
|
||||
|
||||
The output is sorted based on the values of the relevant attribute.
|
||||
|
||||
.. cmdoption:: -r, --reverse
|
||||
|
||||
|
||||
The output is sorted in reverse order (should be used with the --sort option).
|
||||
(Works even if the --sort option is not set, but could not find on what
|
||||
(Works even if the --sort option is not set, but could not find on what
|
||||
the output is sorted).
|
||||
|
||||
.. cmdoption:: -E FLOAT, --errors=FLOAT
|
||||
|
||||
FLOAT is the fraction of reference sequences that will
|
||||
be ignored when looking for the most recent common ancestor. This
|
||||
option is useful when a non-negligible proportion of reference sequences
|
||||
is expected to be assigned to the wrong taxon, for example because of
|
||||
|
||||
FLOAT is the fraction of reference sequences that will
|
||||
be ignored when looking for the most recent common ancestor. This
|
||||
option is useful when a non-negligible proportion of reference sequences
|
||||
is expected to be assigned to the wrong taxon, for example because of
|
||||
taxonomic misidentification. FLOAT is included in a [0,1] interval.
|
||||
|
||||
.. cmdoption:: --cache-size=INTEGER
|
||||
|
||||
A cache for computed similarities is maintained by `ecotag`. the default
|
||||
size for this cache is 1,000,000 of scores. This option allows to change
|
||||
the cache size.
|
||||
|
||||
.. include:: ../optionsSet/taxonomyDB.txt
|
||||
|
||||
|
||||
.. include:: ../optionsSet/inputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/outputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/defaultoptions.txt
|
||||
|
||||
|
||||
:py:mod:`ecotag` added sequence attributes
|
||||
------------------------------------------
|
||||
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
|
||||
- :doc:`best_identity <../attributes/best_identity>`
|
||||
- :doc:`best_match <../attributes/best_match>`
|
||||
- :doc:`family <../attributes/family>`
|
||||
@@ -65,4 +80,3 @@
|
||||
- :doc:`species_list <../attributes/species_list>`
|
||||
- :doc:`species_name <../attributes/species_name>`
|
||||
- :doc:`taxid <../attributes/taxid>`
|
||||
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
.. automodule:: obisubset
|
||||
|
||||
|
||||
:py:mod:`obisubset` specific options
|
||||
------------------------------------
|
||||
|
||||
.. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>,
|
||||
|
||||
The option ``-s`` allows to specify the tag containing sample descriptions,
|
||||
the default value is set to *merged_sample*.
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obiuniq -m sample seq1.fasta > seq2.fasta
|
||||
> obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
After the dereplication of the sequences using the
|
||||
in the new attribute ``merged_sample``.
|
||||
|
||||
.. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>,
|
||||
|
||||
Another tag to clean according to the sample subset
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>,
|
||||
|
||||
File containing the samples names (one sample id per line).
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -l ids.txt seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>,
|
||||
|
||||
A regular expression pattern matching the sample ids to extract.
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -p "negative_.*" seq2.fasta > seq3.fasta
|
||||
|
||||
.. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>,
|
||||
|
||||
A sample id to extract
|
||||
|
||||
*Example:*
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
> obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
|
||||
|
||||
.. include:: ../optionsSet/inputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/outputformat.txt
|
||||
|
||||
.. include:: ../optionsSet/defaultoptions.txt
|
||||
|
||||
:py:mod:`obisubset` modifies sequence attributes
|
||||
------------------------------------------------
|
||||
|
||||
.. hlist::
|
||||
:columns: 3
|
||||
|
||||
- :doc:`count <../attributes/count>`
|
||||
- :doc:`merged_* <../attributes/merged_star>`
|
||||
|
||||
:py:mod:`obisubset` used sequence attribute
|
||||
-------------------------------------------
|
||||
|
||||
- :doc:`count <../attributes/taxid>`
|
||||
- :doc:`merged_* <../attributes/merged_star>`
|
||||
@@ -19,7 +19,7 @@ from os import path
|
||||
|
||||
|
||||
PACKAGE = "OBITools"
|
||||
VERSION = "1.1.18"
|
||||
VERSION = "1.2.0"
|
||||
AUTHOR = 'Eric Coissac'
|
||||
EMAIL = 'eric@coissac.eu'
|
||||
URL = 'metabarcoding.org/obitools'
|
||||
|
||||
+60
-4
@@ -46,6 +46,8 @@ from obitools.options.taxonomyfilter import addTaxonomyDBOptions,loadTaxonomyDat
|
||||
from obitools.options import getOptionManager
|
||||
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import sys
|
||||
import math
|
||||
import os.path
|
||||
@@ -76,6 +78,13 @@ def addSearchOptions(optionManager):
|
||||
default=0.0,
|
||||
help="minimum identity to consider.")
|
||||
|
||||
optionManager.add_option('--minimum-circle',
|
||||
action="store", dest="circle",
|
||||
metavar="identity",
|
||||
type="float",
|
||||
default=1.0,
|
||||
help="minimum identity considered for the assignment circle.")
|
||||
|
||||
# optionManager.add_option('-S','--normalized-smallest',
|
||||
# action="store_false", dest="large",
|
||||
# default=True,
|
||||
@@ -139,6 +148,13 @@ def addSearchOptions(optionManager):
|
||||
default=0.0,
|
||||
help='Tolerated rate of wrong assignation')
|
||||
|
||||
optionManager.add_option('--cache-size',
|
||||
action='store',dest='cache',
|
||||
type='int',
|
||||
metavar='<SIZE>',
|
||||
default=1000000,
|
||||
help='Cache size for the aligment score')
|
||||
|
||||
|
||||
def count(data):
|
||||
rep = {}
|
||||
@@ -190,6 +206,31 @@ def myLenlcs(s1, s2, minid, normalized, reference):
|
||||
return lcs, lali
|
||||
|
||||
|
||||
def cachedLenLCS(s1,s2,minid,normalized,reference):
|
||||
global __LCSCache__
|
||||
global __INCache__
|
||||
global __OUTCache__
|
||||
global __CACHE_SIZE__
|
||||
|
||||
pair=frozenset((s1.id,s2.id))
|
||||
|
||||
if pair in __LCSCache__:
|
||||
rep=__LCSCache__[pair]
|
||||
del __LCSCache__[pair]
|
||||
__INCache__+=1.0
|
||||
|
||||
else:
|
||||
rep=lenlcs(s1,s2,minid,normalized,reference)
|
||||
__OUTCache__+=1.0
|
||||
|
||||
__LCSCache__[pair]=rep
|
||||
|
||||
if len(__LCSCache__) > __CACHE_SIZE__:
|
||||
__LCSCache__.popitem(0)
|
||||
return rep
|
||||
|
||||
|
||||
|
||||
#def lcsIterator(entries,db,options):
|
||||
#
|
||||
# for seq in entries:
|
||||
@@ -233,7 +274,7 @@ def lcsIteratorSelf(entries,db,options):
|
||||
maxid = ([],0.0)
|
||||
minid = options.minimum
|
||||
for d in db:
|
||||
lcs,lali = myLenlcs(seq,d,minid,normalized=True,reference=ALILEN)
|
||||
lcs,lali = myLenlcs(seq,d,minid,normalized=True,reference=ALILEN) # @UnusedVariable
|
||||
if lcs > maxid[1] and lcs > options.minimum:
|
||||
maxid = ([d],lcs)
|
||||
minid = maxid[1]
|
||||
@@ -241,11 +282,13 @@ def lcsIteratorSelf(entries,db,options):
|
||||
maxid[0].append(d)
|
||||
|
||||
if maxid[0]:
|
||||
if maxid[1] > options.circle:
|
||||
maxid=(maxid[0],options.circle)
|
||||
results.extend([(s,maxid[1]) for s in maxid[0]])
|
||||
for d in db:
|
||||
for s in maxid[0]:
|
||||
if d.id != s.id:
|
||||
lcs,lali = lenlcs(s,d,maxid[1],normalized=True,reference=ALILEN)
|
||||
lcs,lali = cachedLenLCS(s,d,maxid[1],normalized=True,reference=ALILEN) # @UnusedVariable
|
||||
if lcs >= maxid[1]:
|
||||
results.append((d,lcs))
|
||||
|
||||
@@ -253,9 +296,19 @@ def lcsIteratorSelf(entries,db,options):
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
__LCSCache__=OrderedDict()
|
||||
__INCache__=1.0
|
||||
__OUTCache__=1.0
|
||||
|
||||
|
||||
optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
|
||||
|
||||
(options, entries) = optionParser()
|
||||
|
||||
__CACHE_SIZE__=options.cache
|
||||
|
||||
if __CACHE_SIZE__ < 10:
|
||||
__CACHE_SIZE__=10
|
||||
|
||||
taxonomy = loadTaxonomyDatabase(options)
|
||||
writer = sequenceWriterGenerator(options)
|
||||
@@ -278,7 +331,7 @@ if __name__=='__main__':
|
||||
taxonlink = {}
|
||||
|
||||
rankid = taxonomy.findRankByName(options.explain)
|
||||
|
||||
|
||||
for seq in db:
|
||||
id = seq.id[0:46]
|
||||
seq.id=id
|
||||
@@ -299,6 +352,8 @@ if __name__=='__main__':
|
||||
|
||||
search = lcsIteratorSelf(entries,db,options)
|
||||
|
||||
print >>sys.stderr,'\nCache size : %d\n' % __CACHE_SIZE__
|
||||
|
||||
|
||||
for seq,best,match in search:
|
||||
try:
|
||||
@@ -385,8 +440,9 @@ if __name__=='__main__':
|
||||
else:
|
||||
seq['species_name']=None
|
||||
|
||||
|
||||
writer(seq)
|
||||
|
||||
print >>sys.stderr,'\n%5.3f%% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -48,13 +48,14 @@ from obitools.options import getOptionManager, allEntryIterator
|
||||
from obitools.align import QSolexaReverseAssemble
|
||||
from obitools.align import QSolexaRightReverseAssemble
|
||||
from obitools.tools._solexapairend import buildConsensus
|
||||
from obitools.format.options import addInputFormatOption,addOutputFormatOption,\
|
||||
from obitools.format.options import addOutputFormatOption,\
|
||||
sequenceWriterGenerator
|
||||
|
||||
from itertools import chain
|
||||
import cPickle
|
||||
import math
|
||||
from obitools.fastq._fastq import fastqIterator
|
||||
from obitools.fastq._fastq import fastqIterator # @UnresolvedImport
|
||||
|
||||
|
||||
def addSolexaPairEndOptions(optionManager):
|
||||
optionManager.add_option('-r','--reverse-reads',
|
||||
@@ -90,14 +91,6 @@ def addSolexaPairEndOptions(optionManager):
|
||||
const='illumina',
|
||||
help="input file is in fastq nucleic format produced by old solexa sequencer")
|
||||
|
||||
# optionManager.add_option('--proba',
|
||||
# action="store", dest="proba",
|
||||
# metavar="<FILENAME>",
|
||||
# type="str",
|
||||
# default=None,
|
||||
# help="null ditribution data file")
|
||||
|
||||
|
||||
optionManager.add_option('--score-min',
|
||||
action="store", dest="smin",
|
||||
metavar="#.###",
|
||||
@@ -105,13 +98,6 @@ def addSolexaPairEndOptions(optionManager):
|
||||
default=None,
|
||||
help="minimum score for keeping aligment")
|
||||
|
||||
# optionManager.add_option('--pvalue',
|
||||
# action="store", dest="pvalue",
|
||||
# metavar="#.###",
|
||||
# type="float",
|
||||
# default=None,
|
||||
# help="maximum pvalue for keeping aligment")
|
||||
|
||||
|
||||
|
||||
def cutDirectReverse(entries):
|
||||
@@ -222,6 +208,8 @@ if __name__ == '__main__':
|
||||
|
||||
#WARNING TO REMOVE : DIRTY PATCH !
|
||||
options.proba = None
|
||||
options.skip = None
|
||||
options.only = None
|
||||
|
||||
|
||||
options.sminL = None
|
||||
|
||||
+72
-59
@@ -68,7 +68,7 @@ Otherwise,
|
||||
'''
|
||||
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
from obitools.fasta import fastaIterator,formatFasta
|
||||
from obitools.options import getOptionManager
|
||||
@@ -110,8 +110,10 @@ def addObiaddtaxidsOptions(optionManager):
|
||||
metavar="<FORMAT>",
|
||||
type="string",
|
||||
default='raw',
|
||||
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE' or 'SILVA'."
|
||||
" Default : raw.")
|
||||
help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE_FULL', 'UNITE_GENERAL' or 'SILVA'."
|
||||
"The UNITE_FULL format is the one used for the 'Full UNITE+INSD dataset', and the UNITE_GENERAL format is the "
|
||||
"one used for the 'General FASTA release'."
|
||||
" Default : raw.")
|
||||
|
||||
optionManager.add_option('-k','--key-name',
|
||||
action="store", dest="tagname",
|
||||
@@ -142,22 +144,52 @@ def numberInStr(s) :
|
||||
return containsNumber
|
||||
|
||||
|
||||
def UNITEIterator(f):
|
||||
def UNITEIterator_FULL(f):
|
||||
|
||||
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
|
||||
for entry in fastaEntryIterator(f) :
|
||||
all = entry.split('\n')
|
||||
header = all[0]
|
||||
fields = header.split('|')
|
||||
id = fields[0][1:]
|
||||
fields = header.split('|')
|
||||
seq_id = fields[0][1:]
|
||||
seq = all[1]
|
||||
s = NucSequence(id, seq)
|
||||
s['ISDN_species_name'] = fields[1]
|
||||
s['UNITE_species_name'] = fields[3]
|
||||
path1 = fields[2].replace(';', ',')
|
||||
path2 = fields[4].replace(';', ',')
|
||||
s['ISDN_path'] = path1
|
||||
s['UNITE_path'] = path2
|
||||
s = NucSequence(seq_id, seq)
|
||||
|
||||
path = fields[1]
|
||||
|
||||
species_name_loc = path.index('s__')
|
||||
species_name_loc+=3
|
||||
s['species_name'] = path[species_name_loc:]
|
||||
|
||||
genus_name_loc = path.index('g__')
|
||||
genus_name_loc+=3
|
||||
s['genus_name'] = path[genus_name_loc:species_name_loc-4]
|
||||
|
||||
path = re.sub('[a-z]__', '', path)
|
||||
s['path'] = path.replace(';', ',')
|
||||
|
||||
yield s
|
||||
|
||||
|
||||
def UNITEIterator_GENERAL(f):
|
||||
|
||||
fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
|
||||
for entry in fastaEntryIterator(f) :
|
||||
all = entry.split('\n')
|
||||
header = all[0]
|
||||
fields = header.split('|')
|
||||
|
||||
seq_id = fields[0][1:]
|
||||
seq = all[1]
|
||||
s = NucSequence(seq_id, seq)
|
||||
|
||||
s['species_name'] = seq_id.replace("_", " ")
|
||||
|
||||
path = fields[4]
|
||||
path = re.sub('[a-z]__', '', path)
|
||||
path = path.replace(';', ',')
|
||||
s['path'] = path.replace(',,', ',')
|
||||
|
||||
yield s
|
||||
|
||||
|
||||
@@ -306,8 +338,10 @@ if __name__=='__main__':
|
||||
if options.db_type == 'raw' :
|
||||
entryIterator = fastaIterator
|
||||
entries = entryIterator(entries)
|
||||
elif options.db_type == 'UNITE' :
|
||||
entryIterator = UNITEIterator
|
||||
elif options.db_type == 'UNITE_FULL' :
|
||||
entryIterator = UNITEIterator_FULL
|
||||
elif options.db_type == 'UNITE_GENERAL' :
|
||||
entryIterator = UNITEIterator_GENERAL
|
||||
entries = entryIterator(entries)
|
||||
elif options.db_type == 'SILVA' :
|
||||
entryIterator = SILVAIterator
|
||||
@@ -342,6 +376,7 @@ if __name__=='__main__':
|
||||
if options.genus_found is not None and len(species_name.split(' ')) >= 2 :
|
||||
try:
|
||||
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
|
||||
s['genus_taxid'] = genusTaxid
|
||||
print>>options.genus_found, formatFasta(s)
|
||||
genusFound = True
|
||||
except KeyError :
|
||||
@@ -357,54 +392,32 @@ if __name__=='__main__':
|
||||
print>>options.unidentified,formatFasta(s)
|
||||
|
||||
|
||||
elif options.db_type == 'UNITE' :
|
||||
restricting_ancestor = tax.findTaxonByName('Fungi')[0]
|
||||
elif ((options.db_type =='UNITE_FULL') or (options.db_type =='UNITE_GENERAL')) :
|
||||
|
||||
restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
|
||||
|
||||
for s in entries :
|
||||
|
||||
try:
|
||||
species_name = s["UNITE_species_name"]
|
||||
|
||||
try :
|
||||
species_name = s['species_name']
|
||||
taxid = getTaxid(tax, species_name, restricting_ancestor)
|
||||
s['taxid']=taxid
|
||||
s["species_name"] = species_name
|
||||
s['taxid'] = taxid
|
||||
s['rank'] = tax.getRank(taxid)
|
||||
print formatFasta(s)
|
||||
|
||||
|
||||
except KeyError:
|
||||
try:
|
||||
species_name = s["ISDN_species_name"]
|
||||
print species_name
|
||||
taxid = getTaxid(tax, species_name, restricting_ancestor)
|
||||
s['taxid']=taxid
|
||||
s["species_name"] = species_name
|
||||
print formatFasta(s)
|
||||
|
||||
except KeyError:
|
||||
genusFound = False
|
||||
if options.genus_found is not None :
|
||||
try:
|
||||
genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
|
||||
s['genus_taxid'] = genusTaxid
|
||||
print>>options.genus_found, formatFasta(s)
|
||||
genusFound = True
|
||||
|
||||
if s["UNITE_species_name"] != "-" and s["UNITE_species_name"] != "" :
|
||||
s["species_name"] = s["UNITE_species_name"]
|
||||
chosen = 'unite'
|
||||
|
||||
elif s["ISDN_species_name"] != "-" and s["ISDN_species_name"] != "" :
|
||||
s["species_name"] = s["ISDN_species_name"]
|
||||
chosen = 'isdn'
|
||||
|
||||
else :
|
||||
if s["UNITE_path"] != "-" and s["UNITE_path"] != "" :
|
||||
chosen = 'unite'
|
||||
s["species_name"] = (s["UNITE_path"].split(', '))[-1]
|
||||
|
||||
elif s["ISDN_path"] != "-" and s["ISDN_path"] != "" :
|
||||
chosen = 'isdn'
|
||||
s["species_name"] = (s["ISDN_path"].split(', '))[-1]
|
||||
|
||||
else :
|
||||
print>>sys.stderr, "\n\nwarning : sequence without any identification at all\n\n"
|
||||
|
||||
if chosen == 'unite' :
|
||||
s['path'] = s["UNITE_path"]
|
||||
else :
|
||||
s['path'] = s["ISDN_path"]
|
||||
|
||||
if options.unidentified is not None :
|
||||
print>>options.unidentified,formatFasta(s)
|
||||
|
||||
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
if options.unidentified is not None and not genusFound :
|
||||
print>>options.unidentified,formatFasta(s)
|
||||
|
||||
+2
-2
@@ -18,7 +18,7 @@
|
||||
'''
|
||||
|
||||
from obitools.options import getOptionManager
|
||||
from obitools.format.options import addInOutputOption
|
||||
from obitools.format.options import addInputFormatOption
|
||||
|
||||
def addCountOptions(optionManager):
|
||||
group=optionManager.add_option_group('Obicount specific options')
|
||||
@@ -36,7 +36,7 @@ def addCountOptions(optionManager):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
optionParser = getOptionManager([addCountOptions,addInOutputOption], progdoc=__doc__)
|
||||
optionParser = getOptionManager([addCountOptions,addInputFormatOption], progdoc=__doc__)
|
||||
|
||||
(options, entries) = optionParser()
|
||||
|
||||
|
||||
+1
-1
@@ -179,7 +179,7 @@ if __name__ == '__main__':
|
||||
i=0
|
||||
for c in classes:
|
||||
i+=1
|
||||
progressBar(i,lclasses,False,"%15s" % c)
|
||||
progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
|
||||
seqs = classes[c]
|
||||
sortclass(seqs, options)
|
||||
if len(c)==1:
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
#!/usr/local/bin/python
|
||||
'''
|
||||
:py:mod:`obisubset`: extract a subset of samples
|
||||
================================================
|
||||
|
||||
.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
|
||||
|
||||
The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
|
||||
after its dereplication using :py:mod:`obiuniq` program.
|
||||
'''
|
||||
from obitools.format.options import addInOutputOption, sequenceWriterGenerator
|
||||
from obitools.options import getOptionManager
|
||||
import re
|
||||
|
||||
def addSubsetOptions(optionManager):
|
||||
|
||||
group = optionManager.add_option_group('obisubset specific options')
|
||||
group.add_option('-s','--sample',
|
||||
action="store", dest="sample",
|
||||
metavar="<TAGNAME>",
|
||||
type="str",
|
||||
default='merged_sample',
|
||||
help="Tag containing sample descriptions, the default value is set to *merged_sample*")
|
||||
|
||||
group.add_option('-o','--other-tag',
|
||||
action="append", dest="taglist",
|
||||
metavar="<TAGNAME>",
|
||||
type="string",
|
||||
default=[],
|
||||
help="Another tag to clean according to the sample subset")
|
||||
|
||||
group.add_option('-l','--sample-list',
|
||||
action="store", dest="samplelist",
|
||||
metavar="<FILENAME>",
|
||||
type="string",
|
||||
default=None,
|
||||
help="File containing the samples names (one sample id per line)")
|
||||
|
||||
group.add_option('-p','--sample-pattern',
|
||||
action="store", dest="samplepattern",
|
||||
metavar="<REGEX>",
|
||||
type="string",
|
||||
default=None,
|
||||
help="A regular expression pattern matching the sample ids to extract")
|
||||
|
||||
group.add_option('-n','--sample-name',
|
||||
action="append", dest="samplename",
|
||||
metavar="<SAMPLEIDS>",
|
||||
type="string",
|
||||
default=[],
|
||||
help="A sample id to extract")
|
||||
|
||||
def sequenceSelectorGenerator(options):
|
||||
|
||||
samplename = set(options.samplename)
|
||||
othertags = set(options.taglist)
|
||||
|
||||
if options.samplelist is not None:
|
||||
with open(options.samplelist) as lname :
|
||||
for name in lname:
|
||||
name = name.strip()
|
||||
samplename.add(name)
|
||||
|
||||
if options.samplepattern is not None:
|
||||
samplepattern = re.compile(options.samplepattern)
|
||||
else:
|
||||
samplepattern = None
|
||||
|
||||
def sequenceSelector(entries):
|
||||
for entry in entries:
|
||||
samples=entry[options.sample]
|
||||
|
||||
slist = set(samples.keys())
|
||||
tokeep=slist & samplename
|
||||
|
||||
if samplepattern is not None:
|
||||
for name in slist:
|
||||
if samplepattern.match(name):
|
||||
tokeep.add(name)
|
||||
|
||||
if tokeep:
|
||||
newsample={}
|
||||
newcount=0
|
||||
for name in tokeep:
|
||||
c = samples[name]
|
||||
newsample[name]= c
|
||||
newcount+=c
|
||||
|
||||
entry['count']=newcount
|
||||
entry[options.sample]=newsample
|
||||
|
||||
for t in othertags:
|
||||
if t in entry:
|
||||
d = entry[t]
|
||||
newd={}
|
||||
for name in tokeep:
|
||||
if name in d:
|
||||
newd[name] = d[name]
|
||||
entry[t]=newd
|
||||
|
||||
yield entry
|
||||
|
||||
return sequenceSelector
|
||||
|
||||
if __name__=='__main__':
|
||||
|
||||
optionParser = getOptionManager([addInOutputOption,addSubsetOptions],progdoc=__doc__)
|
||||
|
||||
(options, entries) = optionParser()
|
||||
|
||||
writer = sequenceWriterGenerator(options)
|
||||
|
||||
good = sequenceSelectorGenerator(options)
|
||||
|
||||
for seq in good(entries):
|
||||
writer(seq)
|
||||
@@ -64,12 +64,14 @@ class EcoPCRDBSequenceIterator(EcoPCRDBFile):
|
||||
class EcoPCRDBSequenceWriter(object):
|
||||
|
||||
def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
|
||||
|
||||
from obitools.options import currentInputFileName
|
||||
self.currentInputFileName=currentInputFileName
|
||||
# Take care of the taxonomy associated to the database
|
||||
self._currentfile=None
|
||||
|
||||
self._taxonomy= loadTaxonomyDatabase(options)
|
||||
dbname = options.ecopcroutput
|
||||
|
||||
|
||||
if (self._taxonomy is not None
|
||||
and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
|
||||
print >> sys.stderr,"Writing the taxonomy file...",
|
||||
@@ -82,23 +84,25 @@ class EcoPCRDBSequenceWriter(object):
|
||||
fileidx = max(list(int(p.search(i).group(1))
|
||||
for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
|
||||
) +1
|
||||
|
||||
self._fileidx=fileidx
|
||||
self._dbname=dbname
|
||||
|
||||
|
||||
self._filename="%s_%03d.sdx" % (dbname,fileidx)
|
||||
if append:
|
||||
mode ='r+b'
|
||||
f = universalOpen(self._filename)
|
||||
(recordCount,) = struct.unpack('> I',f.read(4))
|
||||
self._sequenceCount=recordCount
|
||||
self._sequenceFileCount=recordCount
|
||||
del f
|
||||
self._file = open(self._filename,mode)
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',0))
|
||||
self.open('r+b')
|
||||
self._file.seek(0,2)
|
||||
|
||||
else:
|
||||
self._sequenceCount=0
|
||||
mode = 'wb'
|
||||
self._file = open(self._filename,mode)
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
self._sequenceFileCount=0
|
||||
self.open("wb")
|
||||
|
||||
if type is not None:
|
||||
assert ftid is not None,"You must specify an id attribute for features"
|
||||
@@ -141,7 +145,28 @@ class EcoPCRDBSequenceWriter(object):
|
||||
return packed
|
||||
|
||||
|
||||
def close(self):
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',self._sequenceFileCount))
|
||||
self._file.close()
|
||||
|
||||
def open(self,mode):
|
||||
self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
|
||||
self._file=open(self._filename,mode)
|
||||
self._sequenceFileCount=0
|
||||
self._file.write(struct.pack('> I',self._sequenceFileCount))
|
||||
|
||||
def put(self,sequence):
|
||||
if self._currentfile is None:
|
||||
self._currentfile=self.currentInputFileName()
|
||||
if self.currentInputFileName() != self._currentfile:
|
||||
self._currentfile=self.currentInputFileName()
|
||||
|
||||
self.close()
|
||||
|
||||
self._fileidx+=1
|
||||
self.open('wb')
|
||||
|
||||
if self._taxonomy is not None:
|
||||
if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
|
||||
sequence.extractTaxon()
|
||||
@@ -149,11 +174,10 @@ class EcoPCRDBSequenceWriter(object):
|
||||
if self._annotation is not None:
|
||||
self._annotation.put(sequence, self._sequenceCount)
|
||||
self._sequenceCount+=1
|
||||
self._sequenceFileCount+=1
|
||||
|
||||
def __del__(self):
|
||||
self._file.seek(0,0)
|
||||
self._file.write(struct.pack('> I',self._sequenceCount))
|
||||
self._file.close()
|
||||
self.close()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -29,13 +29,10 @@ import sys
|
||||
|
||||
import re
|
||||
from obitools.ecopcr import EcoPCRFile
|
||||
from obitools.format.sequence import skipOnErrorIterator
|
||||
from obitools.format.sequence import skipOnErrorIterator, skipfirst, only
|
||||
from obitools import BioSequence
|
||||
from obitools.utils import FakeFile
|
||||
|
||||
from glob import glob
|
||||
|
||||
|
||||
def binarySequenceIterator(lineiterator):
|
||||
|
||||
f = FakeFile(lineiterator)
|
||||
@@ -52,6 +49,23 @@ def binarySequenceIterator(lineiterator):
|
||||
|
||||
def addInputFormatOption(optionManager):
|
||||
|
||||
group = optionManager.add_option_group("Restriction to a sub-part options",
|
||||
"Allow to limit analysis to a sub-part of the data file")
|
||||
|
||||
group.add_option('--skip',
|
||||
action="store", dest="skip",
|
||||
metavar='<N>',
|
||||
default=None,
|
||||
type='int',
|
||||
help="skip the N first sequences")
|
||||
|
||||
group.add_option('--only',
|
||||
action="store", dest="only",
|
||||
metavar='<N>',
|
||||
default=None,
|
||||
type='int',
|
||||
help="treat only N sequences")
|
||||
|
||||
group = optionManager.add_option_group("Input format options",
|
||||
"If not specified, a test is done to determine the file format")
|
||||
|
||||
@@ -245,9 +259,9 @@ def autoEntriesIterator(options):
|
||||
raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'
|
||||
|
||||
if reader==binarySequenceIterator:
|
||||
input = binarySequenceIterator(lineiterator)
|
||||
input = binarySequenceIterator(lineiterator) # @ReservedAssignment
|
||||
else:
|
||||
input = reader(chain([first],lineiterator))
|
||||
input = reader(chain([first],lineiterator)) # @ReservedAssignment
|
||||
|
||||
return input
|
||||
|
||||
@@ -294,12 +308,20 @@ def autoEntriesIterator(options):
|
||||
|
||||
if options.skiperror:
|
||||
reader = skipOnErrorIterator(reader)
|
||||
|
||||
if hasattr(options, 'skip') and options.skip is not None:
|
||||
print >>sys.stderr,"Skipping %d sequences" % options.skip
|
||||
reader = skipfirst(reader,options.skip)
|
||||
|
||||
if hasattr(options, 'only') and options.only is not None:
|
||||
print >>sys.stderr,"Analysing only %d sequences" % options.only
|
||||
reader = only(reader,options.only)
|
||||
|
||||
return reader
|
||||
|
||||
def sequenceWriterGenerator(options,output=sys.stdout):
|
||||
class SequenceWriter:
|
||||
def __init__(self,options,file=sys.stdout):
|
||||
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
|
||||
self._format=None
|
||||
self._file=file
|
||||
self._upper=options.uppercase
|
||||
@@ -324,7 +346,7 @@ def sequenceWriterGenerator(options,output=sys.stdout):
|
||||
sys.exit(0)
|
||||
|
||||
class BinaryWriter:
|
||||
def __init__(self,options,file=sys.stdout):
|
||||
def __init__(self,options,file=sys.stdout): # @ReservedAssignment
|
||||
self._file=file
|
||||
self._file.write("#!Pickle\n")
|
||||
def put(self,seq):
|
||||
|
||||
@@ -21,6 +21,33 @@ def skipOnErrorIterator(seqIterator):
|
||||
continue
|
||||
|
||||
return internal
|
||||
|
||||
def skipfirst(seqIterator,n):
|
||||
def internal(inputdata):
|
||||
si = seqIterator(inputdata)
|
||||
c=0
|
||||
for seq in si:
|
||||
c+=1
|
||||
if c > n:
|
||||
yield seq
|
||||
print >>sys.stderr
|
||||
|
||||
return internal
|
||||
|
||||
|
||||
def only(seqIterator,n):
|
||||
def internal(inputdata):
|
||||
si = seqIterator(inputdata)
|
||||
c=0
|
||||
for seq in si:
|
||||
if c < n:
|
||||
yield seq
|
||||
else:
|
||||
break
|
||||
c+=1
|
||||
print >>sys.stderr
|
||||
return internal
|
||||
|
||||
|
||||
|
||||
def autoSequenceIterator(file):
|
||||
|
||||
@@ -93,7 +93,7 @@ def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,opti
|
||||
else:
|
||||
|
||||
if entryIterator == EcoPCRDBSequenceIterator and options is not None:
|
||||
if options.ecodb==f:
|
||||
if hasattr(options,'ecodb') and options.ecodb==f:
|
||||
iterator = entryIterator(f,options.taxonomy)
|
||||
else:
|
||||
iterator = entryIterator(f)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
major = 1
|
||||
minor = 1
|
||||
serial= '18'
|
||||
minor = 2
|
||||
serial= '0'
|
||||
|
||||
version = "%2d.%02d %s" % (major,minor,serial)
|
||||
|
||||
Reference in New Issue
Block a user