mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 16:50:27 +00:00
A first version of obigrep. Normally fully functionnal, but not fully tested
This commit is contained in:
@@ -38,9 +38,10 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
|
||||
}
|
||||
options.BoolVar(&__rank_list__, "rank-list", false,
|
||||
options.Alias("l"),
|
||||
options.Description("List every taxonomic rank available iin the taxonomy."))
|
||||
options.IntSliceVar(&__taxonomical_restriction__, "subclade-of", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.Description("List every taxonomic rank available in the taxonomy."))
|
||||
|
||||
options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1,
|
||||
options.Alias("r"),
|
||||
options.Description("Restrict output to some subclades."))
|
||||
}
|
||||
|
||||
|
||||
47
pkg/obitools/obigrep/grep.go
Normal file
47
pkg/obitools/obigrep/grep.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package obigrep
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
)
|
||||
|
||||
func IFilterSequence(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||
var newIter obiiter.IBioSequenceBatch
|
||||
|
||||
predicate := CLISequenceSelectionPredicate()
|
||||
|
||||
if predicate != nil {
|
||||
if CLISaveDiscardedSequences() {
|
||||
var discarded obiiter.IBioSequenceBatch
|
||||
|
||||
log.Printf("Discarded sequences saved in file: %s\n", CLIDiscardedFileName())
|
||||
newIter, discarded = iterator.DivideOn(predicate,
|
||||
obioptions.CLIBatchSize())
|
||||
|
||||
go func() {
|
||||
_, err := obiconvert.WriteBioSequencesBatch(discarded,
|
||||
true,
|
||||
CLIDiscardedFileName())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
} else {
|
||||
newIter = iterator.FilterOn(predicate,
|
||||
obioptions.CLIBatchSize(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
obioptions.CLIBufferSize(),
|
||||
)
|
||||
}
|
||||
} else {
|
||||
newIter = iterator
|
||||
}
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
||||
375
pkg/obitools/obigrep/options.go
Normal file
375
pkg/obitools/obigrep/options.go
Normal file
@@ -0,0 +1,375 @@
|
||||
package obigrep
|
||||
|
||||
import (
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats/ncbitaxdump"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _BelongTaxa = make([]int, 0)
|
||||
var _NotBelongTaxa = make([]int, 0)
|
||||
var _RequiredRanks = make([]string, 0)
|
||||
|
||||
var _MinimumLength = 1
|
||||
var _MaximumLength = int(2e9)
|
||||
|
||||
var _MinimumCount = 1
|
||||
var _MaximumCount = int(2e9)
|
||||
|
||||
var _SequencePatterns = make([]string, 0)
|
||||
var _DefinitionPatterns = make([]string, 0)
|
||||
var _IdPatterns = make([]string, 0)
|
||||
|
||||
var _Predicats = make([]string, 0)
|
||||
|
||||
var _IdList = ""
|
||||
|
||||
var _Taxdump = ""
|
||||
var _Taxonomy = (*obitax.Taxonomy)(nil)
|
||||
var _RequiredAttributes = make([]string, 0)
|
||||
var _AttributePatterns = make(map[string]string, 0)
|
||||
|
||||
var _InvertMatch = false
|
||||
var _SaveRejected = ""
|
||||
|
||||
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringVar(&_Taxdump, "taxdump", _Taxdump,
|
||||
options.Alias("t"),
|
||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
||||
|
||||
options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
|
||||
options.Alias("r"),
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
|
||||
|
||||
options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1,
|
||||
options.Alias("i"),
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid."))
|
||||
|
||||
options.StringSliceVar(&_RequiredRanks, "require-rank", 1, 1,
|
||||
options.ArgName("RANK_NAME"),
|
||||
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
|
||||
|
||||
}
|
||||
|
||||
func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
TaxonomySelectionOptionSet(options)
|
||||
options.StringVar(&_SaveRejected, "save-discarded", _SaveRejected,
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("Indicates in which file not selected sequences must be saved."))
|
||||
|
||||
options.StringVar(&_IdList, "id-list", _IdList,
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("<FILENAME> points to a text file containing the list of sequence record identifiers to be selected."+
|
||||
" The file format consists in a single identifier per line."))
|
||||
|
||||
options.BoolVar(&_InvertMatch, "inverse-match", _InvertMatch,
|
||||
options.Alias("v"),
|
||||
options.Description("Inverts the sequence record selection."))
|
||||
|
||||
options.IntVar(&_MinimumLength, "min-length", _MinimumLength,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Alias("l"),
|
||||
options.Description("Selects sequence records whose sequence length is equal or longer than lmin."))
|
||||
|
||||
options.IntVar(&_MaximumLength, "max-length", _MaximumLength,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Alias("L"),
|
||||
options.Description("Selects sequence records whose sequence length is equal or shorter than lmax."))
|
||||
|
||||
options.IntVar(&_MinimumCount, "min-count", _MinimumCount,
|
||||
options.ArgName("COUNT"),
|
||||
options.Alias("c"),
|
||||
options.Description("Selects sequence records occuring at least min count time in the data set."))
|
||||
|
||||
options.IntVar(&_MaximumCount, "max-count", _MaximumCount,
|
||||
options.ArgName("COUNT"),
|
||||
options.Alias("C"),
|
||||
options.Description("Selects sequence records occuring no more than max count time in the data set."))
|
||||
|
||||
options.StringSliceVar(&_Predicats, "predicate", 1, 1,
|
||||
options.Alias("p"),
|
||||
options.ArgName("EXPRESSION"),
|
||||
options.Description("boolean expression to be evaluated for each sequence record. "+
|
||||
"The attribute keys defined for each sequence record can be used in the "+
|
||||
"expression as variable names. An extra variable named ‘sequence’ refers "+
|
||||
"to the sequence record itself. Several -p options can be used on the same "+
|
||||
"command line and in this last case, the selected sequence records will match "+
|
||||
"all constraints."))
|
||||
|
||||
options.StringSliceVar(&_SequencePatterns, "sequence", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_DefinitionPatterns, "definition", 1, 1,
|
||||
options.Alias("D"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence definition. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_IdPatterns, "identifier", 1, 1,
|
||||
options.Alias("I"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence identifier. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_RequiredAttributes, "has-attribute", 1, 1,
|
||||
options.Alias("A"),
|
||||
options.ArgName("KEY"),
|
||||
options.Description("Selects sequence records having an attribute whose key = <KEY>."))
|
||||
|
||||
options.StringMapVar(&_AttributePatterns, "attribute", 1, 1,
|
||||
options.Alias("a"),
|
||||
options.ArgName("KEY=VALUE"),
|
||||
options.Description("Regular expression pattern matched against the attributes of the sequence record. "+
|
||||
"the value of this attribute is of the form : key:regular_pattern. The pattern is case sensitive. "+
|
||||
"Several -a options can be used on the same command line and in this last case, the selected "+
|
||||
"sequence records will match all constraints."))
|
||||
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obipcr command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
SequenceSelectionOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIMinSequenceLength() int {
|
||||
return _MinimumLength
|
||||
}
|
||||
|
||||
func CLIMaxSequenceLength() int {
|
||||
return _MaximumLength
|
||||
}
|
||||
|
||||
func CLISequenceSizePredicate() obiseq.SequencePredicate {
|
||||
if _MinimumLength > 1 {
|
||||
p := obiseq.IsLongerOrEqualTo(_MinimumLength)
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
p = p.And(obiseq.IsShorterOrEqualTo(_MaximumLength))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
return obiseq.IsShorterOrEqualTo(_MaximumLength)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIMinSequenceCount() int {
|
||||
return _MinimumCount
|
||||
}
|
||||
|
||||
func CLIMaxSequenceCount() int {
|
||||
return _MaximumCount
|
||||
}
|
||||
|
||||
func CLISequenceCountPredicate() obiseq.SequencePredicate {
|
||||
if _MinimumCount > 1 {
|
||||
p := obiseq.IsMoreAbundantOrEqualTo(_MinimumCount)
|
||||
|
||||
if _MaximumCount != int(2e9) {
|
||||
p = p.And(obiseq.IsLessAbundantOrEqualTo(_MaximumCount))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
return obiseq.IsLessAbundantOrEqualTo(_MaximumCount)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISelectedNCBITaxDump() string {
|
||||
return _Taxdump
|
||||
}
|
||||
|
||||
func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
|
||||
if CLISelectedNCBITaxDump() != "" {
|
||||
if _Taxonomy == nil {
|
||||
var err error
|
||||
_Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true)
|
||||
if err != nil {
|
||||
log.Fatalf("cannot load taxonomy %s : %v",
|
||||
CLISelectedNCBITaxDump(), err)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return _Taxonomy
|
||||
}
|
||||
|
||||
log.Fatalln("no NCBI taxdump selected using option -t|--taxdump")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_BelongTaxa) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.IsSubCladeOf(*taxonomy, _BelongTaxa[0])
|
||||
|
||||
for _, taxid := range _BelongTaxa[1:] {
|
||||
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_NotBelongTaxa) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.IsSubCladeOf(*taxonomy, _NotBelongTaxa[0])
|
||||
|
||||
for _, taxid := range _NotBelongTaxa[1:] {
|
||||
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
|
||||
}
|
||||
|
||||
return p.Not()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_RequiredRanks) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.HasRankDefined(*taxonomy, _RequiredRanks[0])
|
||||
|
||||
for _, rank := range _RequiredRanks[1:] {
|
||||
p.And(obitax.HasRankDefined(*taxonomy, rank))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
|
||||
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate())
|
||||
}
|
||||
|
||||
func CLIPredicatesPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_Predicats) > 0 {
|
||||
p := obiseq.ExpressionPredicat(_Predicats[0])
|
||||
|
||||
for _, expression := range _Predicats[1:] {
|
||||
p.And(obiseq.ExpressionPredicat(expression))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISequencePatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_SequencePatterns) > 0 {
|
||||
p := obiseq.IsSequenceMatch(_SequencePatterns[0])
|
||||
|
||||
for _, pattern := range _SequencePatterns[1:] {
|
||||
p.And(obiseq.IsSequenceMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIDefinitionPatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_DefinitionPatterns) > 0 {
|
||||
p := obiseq.IsDefinitionMatch(_DefinitionPatterns[0])
|
||||
|
||||
for _, pattern := range _DefinitionPatterns[1:] {
|
||||
p.And(obiseq.IsDefinitionMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIdPatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_IdPatterns) > 0 {
|
||||
p := obiseq.IsIdMatch(_IdPatterns[0])
|
||||
|
||||
for _, pattern := range _IdPatterns[1:] {
|
||||
p.And(obiseq.IsIdMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIdListPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if _IdList != "" {
|
||||
ids, err := goutils.ReadLines(_IdList)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("cannot read the id file %s : %v", _IdList, err)
|
||||
}
|
||||
|
||||
for i, v := range ids {
|
||||
ids[i] = strings.TrimSpace(v)
|
||||
}
|
||||
|
||||
p := obiseq.IsIdIn(ids...)
|
||||
|
||||
return p
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISequenceSelectionPredicate() obiseq.SequencePredicate {
|
||||
p := CLISequenceSizePredicate()
|
||||
p = p.And(CLISequenceCountPredicate())
|
||||
p = p.And(CLITaxonomyFilterPredicate())
|
||||
p = p.And(CLIPredicatesPredicate())
|
||||
p = p.And(CLISequencePatternPredicate())
|
||||
p = p.And(CLIDefinitionPatternPredicate())
|
||||
p = p.And(CLIIdPatternPredicate())
|
||||
p = p.And(CLIIdListPredicate())
|
||||
|
||||
if _InvertMatch {
|
||||
p = p.Not()
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func CLISaveDiscardedSequences() bool {
|
||||
return _SaveRejected != ""
|
||||
}
|
||||
|
||||
func CLIDiscardedFileName() string {
|
||||
return _SaveRejected
|
||||
}
|
||||
Reference in New Issue
Block a user